/*
 * Provenance: Subversion repository planix.SVN, revision 2
 * (text exported from the repository web viewer).
 */
#include	"u.h"
2
#include	"../port/lib.h"
3
#include	"mem.h"
4
#include	"dat.h"
5
#include	"fns.h"
6
#include	"../port/error.h"
7
 
8
#include	"ip.h"
9
 
10
/*
 *  Sizes, timer parameters, header flag bits, option codes,
 *  connection states and tuning limits used by the TCP code below.
 */
enum
{
	QMAX		= 64*1024-1,	/* limit handed to qopen() for the conversation queues */
	IP_TCPPROTO	= 6,		/* stored in the proto field of the prototype headers */

	/* IPv4 header geometry (bytes) */
	TCP4_IPLEN	= 8,		/* offset argument given to ptclcsum() */
	TCP4_PHDRSIZE	= 12,		/* pseudo header bytes included in the checksum length */
	TCP4_HDRSIZE	= 20,		/* TCP header without options */
	TCP4_TCBPHDRSZ	= 40,		/* bytes of prototype header copied into each packet */
	TCP4_PKT	= TCP4_IPLEN+TCP4_PHDRSIZE,

	/* IPv6 header geometry (bytes) */
	TCP6_IPLEN	= 0,		/* offset argument given to ptclcsum() */
	TCP6_PHDRSIZE	= 40,		/* pseudo header bytes included in the checksum length */
	TCP6_HDRSIZE	= 20,		/* TCP header without options */
	TCP6_TCBPHDRSZ	= 60,		/* bytes of prototype header copied into each packet */
	TCP6_PKT	= TCP6_IPLEN+TCP6_PHDRSIZE,

	/* values of Tcptimer.state */
	TcptimerOFF	= 0,
	TcptimerON	= 1,
	TcptimerDONE	= 2,

	MAX_TIME 	= (1<<20),	/* Forever */
	TCP_ACK		= 50,		/* Timed ack sequence in ms */
	MAXBACKMS	= 9*60*1000,	/* longest backoff time (ms) before hangup */

	/* bits of the TCP flags byte */
	URG		= 0x20,		/* Data marked urgent */
	ACK		= 0x10,		/* Acknowledge is valid */
	PSH		= 0x08,		/* Whole data pipe is pushed */
	RST		= 0x04,		/* Reset connection */
	SYN		= 0x02,		/* Pkt. is synchronise */
	FIN		= 0x01,		/* Start close down */

	/* option kinds and their encoded lengths */
	EOLOPT		= 0,		/* end of option list */
	NOOPOPT		= 1,		/* one byte of padding */
	MSSOPT		= 2,
	MSS_LENGTH	= 4,		/* Maximum segment size */
	WSOPT		= 3,
	WS_LENGTH	= 3,		/* Bits to scale window size by */

	MSL2		= 10,
	MSPTICK		= 50,		/* Milliseconds per timer tick */
	DEF_MSS		= 1460,		/* Default maximum segment */
	DEF_MSS6	= 1280,		/* Default maximum segment (min) for v6 */
	DEF_RTT		= 500,		/* Default round trip */
	DEF_KAT		= 120000,	/* Default time (ms) between keep alives */
	TCP_LISTEN	= 0,		/* Listen connection */
	TCP_CONNECT	= 1,		/* Outgoing connection */
	SYNACK_RXTIMER	= 250,		/* ms between SYNACK retransmits */

	TCPREXMTTHRESH	= 3,		/* dupack threshhold for rxt */

	/* bits of Tcpctl.flags */
	FORCE		= 1,
	CLONE		= 2,
	RETRAN		= 4,
	ACTIVE		= 8,
	SYNACK		= 16,

	/* shifts applied to the round trip estimators (see srtt in inittcpctl) */
	LOGAGAIN	= 3,
	LOGDGAIN	= 2,

	/* Connection states; tcpstates[] is indexed by these values */
	Closed		= 0,
	Listen		= 1,
	Syn_sent	= 2,
	Syn_received	= 3,
	Established	= 4,
	Finwait1	= 5,
	Finwait2	= 6,
	Close_wait	= 7,
	Closing		= 8,
	Last_ack	= 9,
	Time_wait	= 10,

	Maxlimbo	= 1000,		/* maximum procs waiting for response to SYN ACK */
	NLHT		= 256,		/* hash table size, must be a power of 2 */
	LHTMASK		= NLHT-1,

	/*
	 * window is 64kb * 2ⁿ
	 * these factors determine the ultimate bandwidth-delay product.
	 * with the scale of 4 used below that is 64kb * 2⁴ = 1mb.
	 * (the original note computed 64kb * 2⁵ = 2mb, "2× overkill
	 * for 100mbps * 70ms", which corresponds to a scale of 5.)
	 */
	Maxqscale	= 4,		/* maximum queuing scale */
	Defadvscale	= 4,		/* default advertisement */
};
92
 
93
/*
 *  Printable names of the connection states.  The table is indexed
 *  by the state value, so the order must match the enumeration above.
 */
char *tcpstates[] =
{
	"Closed",
	"Listen",
	"Syn_sent",
	"Syn_received",
	"Established",
	"Finwait1",
	"Finwait2",
	"Close_wait",
	"Closing",
	"Last_ack",
	"Time_wait"
};
100
 
101
/*
 *  A countdown timer.  Running timers are kept on the doubly linked
 *  list headed by Tcppriv.timers; tcpackproc() decrements count on
 *  every tick and calls func(arg) for the ones that reach zero.
 */
typedef struct Tcptimer Tcptimer;
struct Tcptimer
{
	Tcptimer	*next, *prev;	/* links on the list of running timers */
	Tcptimer	*readynext;	/* links timers that expired during one tick */
	int	state;			/* TcptimerOFF, TcptimerON or TcptimerDONE */
	int	start;			/* value loaded into count by tcpgo(); 0 means never start */
	int	count;			/* ticks left before the timer fires */
	void	(*func)(void*);		/* called on expiry, if not nil */
	void	*arg;			/* argument handed to func */
};
113
 
114
/*
115
 *  v4 and v6 pseudo headers used for
116
 *  checksuming tcp
117
 */
118
typedef struct Tcp4hdr Tcp4hdr;
119
struct Tcp4hdr
120
{
121
	uchar	vihl;		/* Version and header length */
122
	uchar	tos;		/* Type of service */
123
	uchar	length[2];	/* packet length */
124
	uchar	id[2];		/* Identification */
125
	uchar	frag[2];	/* Fragment information */
126
	uchar	Unused;
127
	uchar	proto;
128
	uchar	tcplen[2];
129
	uchar	tcpsrc[4];
130
	uchar	tcpdst[4];
131
	/* same as v6 from here on */
132
	uchar	tcpsport[2];
133
	uchar	tcpdport[2];
134
	uchar	tcpseq[4];
135
	uchar	tcpack[4];
136
	uchar	tcpflag[2];
137
	uchar	tcpwin[2];
138
	uchar	tcpcksum[2];
139
	uchar	tcpurg[2];
140
	/* Options segment */
141
	uchar	tcpopt[1];
142
};
143
 
144
typedef struct Tcp6hdr Tcp6hdr;
145
struct Tcp6hdr
146
{
147
	uchar	vcf[4];
148
	uchar	ploadlen[2];
149
	uchar	proto;
150
	uchar	ttl;
151
	uchar	tcpsrc[IPaddrlen];
152
	uchar	tcpdst[IPaddrlen];
153
	/* same as v4 from here on */
154
	uchar	tcpsport[2];
155
	uchar	tcpdport[2];
156
	uchar	tcpseq[4];
157
	uchar	tcpack[4];
158
	uchar	tcpflag[2];
159
	uchar	tcpwin[2];
160
	uchar	tcpcksum[2];
161
	uchar	tcpurg[2];
162
	/* Options segment */
163
	uchar	tcpopt[1];
164
};
165
 
166
/*
167
 *  this represents the control info
168
 *  for a single packet.  It is derived from
169
 *  a packet in ntohtcp{4,6}() and stuck into
170
 *  a packet in htontcp{4,6}().
171
 */
172
typedef struct Tcp Tcp;
173
struct	Tcp
174
{
175
	ushort	source;
176
	ushort	dest;
177
	ulong	seq;
178
	ulong	ack;
179
	uchar	flags;
180
	uchar	update;
181
	ushort	ws;	/* window scale option */
182
	ulong	wnd;	/* prescaled window*/
183
	ushort	urg;
184
	ushort	mss;	/* max segment size option (if not zero) */
185
	ushort	len;	/* size of data */
186
};
187
 
188
/*
189
 *  this header is malloc'd to thread together fragments
190
 *  waiting to be coalesced
191
 */
192
typedef struct Reseq Reseq;
193
struct Reseq
194
{
195
	Reseq	*next;
196
	Tcp	seg;
197
	Block	*bp;
198
	ushort	length;
199
};
200
 
201
/*
202
 *  the qlock in the Conv locks this structure
203
 */
204
typedef struct Tcpctl Tcpctl;
205
struct Tcpctl
206
{
207
	uchar	state;			/* Connection state */
208
	uchar	type;			/* Listening or active connection */
209
	uchar	code;			/* Icmp code */
210
	struct {
211
		ulong	una;		/* Unacked data pointer */
212
		ulong	nxt;		/* Next sequence expected */
213
		ulong	ptr;		/* Data pointer */
214
		ulong	wnd;		/* Tcp send window */
215
		ulong	urg;		/* Urgent data pointer */
216
		ulong	wl2;
217
		uint	scale;		/* how much to right shift window */
218
					/* in xmitted packets */
219
		/* to implement tahoe and reno TCP */
220
		ulong	dupacks;	/* number of duplicate acks rcvd */
221
		ulong	partialack;
222
		int	recovery;	/* loss recovery flag */
223
		int	retransmit;	/* retransmit 1 packet @ una flag */
224
		int	rto;
225
		ulong	rxt;		/* right window marker for recovery */
226
					/* "recover" rfc3782 */
227
	} snd;
228
	struct {
229
		ulong	nxt;		/* Receive pointer to next uchar slot */
230
		ulong	wnd;		/* Receive window incoming */
231
		ulong	wsnt;		/* Last wptr sent.  important to */
232
					/* track for large bdp */
233
		ulong	wptr;
234
		ulong	urg;		/* Urgent pointer */
235
		ulong	ackptr;		/* last acked sequence */
236
		int	blocked;
237
		uint	scale;		/* how much to left shift window in */
238
					/* rcv'd packets */
239
	} rcv;
240
	ulong	iss;			/* Initial sequence number */
241
	ulong	cwind;			/* Congestion window */
242
	ulong	abcbytes;		/* appropriate byte counting rfc 3465 */
243
	uint	scale;			/* desired snd.scale */
244
	ulong	ssthresh;		/* Slow start threshold */
245
	int	resent;			/* Bytes just resent */
246
	int	irs;			/* Initial received squence */
247
	ushort	mss;			/* Maximum segment size */
248
	int	rerecv;			/* Overlap of data rerecevived */
249
	ulong	window;			/* Our receive window (queue) */
250
	uint	qscale;			/* Log2 of our receive window (queue) */
251
	uchar	backoff;		/* Exponential backoff counter */
252
	int	backedoff;		/* ms we've backed off for rexmits */
253
	uchar	flags;			/* State flags */
254
	Reseq	*reseq;			/* Resequencing queue */
255
	int	nreseq;
256
	int	reseqlen;
257
	Tcptimer	timer;			/* Activity timer */
258
	Tcptimer	acktimer;		/* Acknowledge timer */
259
	Tcptimer	rtt_timer;		/* Round trip timer */
260
	Tcptimer	katimer;		/* keep alive timer */
261
	ulong	rttseq;			/* Round trip sequence */
262
	int	srtt;			/* Smoothed round trip */
263
	int	mdev;			/* Mean deviation of round trip */
264
	int	kacounter;		/* count down for keep alive */
265
	uint	sndsyntime;		/* time syn sent */
266
	ulong	time;			/* time Finwait2 or Syn_received was sent */
267
	ulong	timeuna;		/* snd.una when time was set */
268
	int	nochecksum;		/* non-zero means don't send checksums */
269
	int	flgcnt;			/* number of flags in the sequence (FIN,SEQ) */
270
 
271
	union {
272
		Tcp4hdr	tcp4hdr;
273
		Tcp6hdr	tcp6hdr;
274
	} protohdr;		/* prototype header */
275
};
276
 
277
/*
278
 *  New calls are put in limbo rather than having a conversation structure
279
 *  allocated.  Thus, a SYN attack results in lots of limbo'd calls but not
280
 *  any real Conv structures mucking things up.  Calls in limbo rexmit their
281
 *  SYN ACK every SYNACK_RXTIMER ms up to 4 times, i.e., they disappear after 1 second.
282
 *
283
 *  In particular they aren't on a listener's queue so that they don't figure
284
 *  in the input queue limit.
285
 *
286
 *  If 1/2 of a T3 was attacking SYN packets, we'ld have a permanent queue
287
 *  of 70000 limbo'd calls.  Not great for a linear list but doable.  Therefore
288
 *  there is no hashing of this list.
289
 */
290
typedef struct Limbo Limbo;
291
struct Limbo
292
{
293
	Limbo	*next;
294
 
295
	uchar	laddr[IPaddrlen];
296
	uchar	raddr[IPaddrlen];
297
	ushort	lport;
298
	ushort	rport;
299
	ulong	irs;		/* initial received sequence */
300
	ulong	iss;		/* initial sent sequence */
301
	ushort	mss;		/* mss from the other end */
302
	ushort	rcvscale;	/* how much to scale rcvd windows */
303
	ushort	sndscale;	/* how much to scale sent windows */
304
	ulong	lastsend;	/* last time we sent a synack */
305
	uchar	version;	/* v4 or v6 */
306
	uchar	rexmits;	/* number of retransmissions */
307
};
308
 
309
int	tcp_irtt = DEF_RTT;	/* Initial guess at round trip time, in ms (divided by MSPTICK to get timer ticks) */
310
 
311
/*
 *  Indices into Tcppriv.stats[] and statnames[].
 *  Nstats must stay last: it sizes both arrays.
 */
enum {
	/* MIB stats */
	MaxConn = 0,
	Mss,
	ActiveOpens,
	PassiveOpens,
	EstabResets,
	CurrEstab,
	InSegs,
	OutSegs,
	RetransSegs,
	RetransSegsSent,
	RetransTimeouts,
	InErrs,
	OutRsts,

	/* non-MIB stats */
	CsumErrs,
	HlenErrs,
	LenErrs,
	Resequenced,
	OutOfOrder,
	ReseqBytelim,
	ReseqPktlim,
	Delayack,
	Wopenack,

	/* loss recovery counters */
	Recovery,
	RecoveryDone,
	RecoveryRTO,
	RecoveryNoSeq,
	RecoveryCwind,
	RecoveryPA,

	Nstats
};
347
 
348
static char *statnames[Nstats] =
349
{
350
[MaxConn]	"MaxConn",
351
[Mss]		"MaxSegment",
352
[ActiveOpens]	"ActiveOpens",
353
[PassiveOpens]	"PassiveOpens",
354
[EstabResets]	"EstabResets",
355
[CurrEstab]	"CurrEstab",
356
[InSegs]	"InSegs",
357
[OutSegs]	"OutSegs",
358
[RetransSegs]	"RetransSegs",
359
[RetransSegsSent]	"RetransSegsSent",
360
[RetransTimeouts]	"RetransTimeouts",
361
[InErrs]	"InErrs",
362
[OutRsts]	"OutRsts",
363
[CsumErrs]	"CsumErrs",
364
[HlenErrs]	"HlenErrs",
365
[LenErrs]	"LenErrs",
366
[OutOfOrder]	"OutOfOrder",
367
[Resequenced]	"Resequenced",
368
[ReseqBytelim]	"ReseqBytelim",
369
[ReseqPktlim]	"ReseqPktlim",
370
[Delayack]	"Delayack",
371
[Wopenack]	"Wopenack",
372
 
373
[Recovery]	"Recovery",
374
[RecoveryDone]	"RecoveryDone",
375
[RecoveryRTO]	"RecoveryRTO",
376
 
377
[RecoveryNoSeq]	"RecoveryNoSeq",
378
[RecoveryCwind]	"RecoveryCwind",
379
[RecoveryPA]	"RecoveryPA",
380
};
381
 
382
typedef struct Tcppriv Tcppriv;
383
struct Tcppriv
384
{
385
	/* List of active timers */
386
	QLock 	tl;
387
	Tcptimer *timers;
388
 
389
	/* hash table for matching conversations */
390
	Ipht	ht;
391
 
392
	/* calls in limbo waiting for an ACK to our SYN ACK */
393
	int	nlimbo;
394
	Limbo	*lht[NLHT];
395
 
396
	/* for keeping track of tcpackproc */
397
	QLock	apl;
398
	int	ackprocstarted;
399
 
400
	uvlong	stats[Nstats];
401
};
402
 
403
/*
 *  Setting tcpporthogdefense to non-zero enables Dong Lin's
 *  solution to hijacked systems staking out ports as a form
 *  of DoS attack.
 *
 *  To avoid stateless Conv hogs, we pick a sequence number at random.  If
 *  that number gets acked by the other end, we shut down the connection.
 *  Look for tcpporthogdefense in the code.
 */
412
int tcpporthogdefense = 0;	/* zero, the default, leaves the port hog defense off */
413
 
414
static	int	addreseq(Fs*, Tcpctl*, Tcppriv*, Tcp*, Block*, ushort);
415
static	int	dumpreseq(Tcpctl*);
416
static	void	getreseq(Tcpctl*, Tcp*, Block**, ushort*);
417
static	void	limbo(Conv*, uchar*, uchar*, Tcp*, int);
418
static	void	limborexmit(Proto*);
419
static	void	localclose(Conv*, char*);
420
static	void	procsyn(Conv*, Tcp*);
421
static	void	tcpacktimer(void*);
422
static	void	tcpiput(Proto*, Ipifc*, Block*);
423
static	void	tcpkeepalive(void*);
424
static	void	tcpoutput(Conv*);
425
static	void	tcprcvwin(Conv*);
426
static	void	tcprxmit(Conv*);
427
static	void	tcpsetkacounter(Tcpctl*);
428
static	void	tcpsetscale(Conv*, Tcpctl*, ushort, ushort);
429
static	void	tcpsettimer(Tcpctl*);
430
static	void	tcpsndsyn(Conv*, Tcpctl*);
431
static	void	tcpstart(Conv*, int);
432
static	void	tcpsynackrtt(Conv*);
433
static	void	tcptimeout(void*);
434
static	int	tcptrim(Tcpctl*, Tcp*, Block**, ushort*);
435
 
436
static void
437
tcpsetstate(Conv *s, uchar newstate)
438
{
439
	Tcpctl *tcb;
440
	uchar oldstate;
441
	Tcppriv *tpriv;
442
 
443
	tpriv = s->p->priv;
444
 
445
	tcb = (Tcpctl*)s->ptcl;
446
 
447
	oldstate = tcb->state;
448
	if(oldstate == newstate)
449
		return;
450
 
451
	if(oldstate == Established)
452
		tpriv->stats[CurrEstab]--;
453
	if(newstate == Established)
454
		tpriv->stats[CurrEstab]++;
455
 
456
	switch(newstate) {
457
	case Closed:
458
		qclose(s->rq);
459
		qclose(s->wq);
460
		qclose(s->eq);
461
		break;
462
 
463
	case Close_wait:		/* Remote closes */
464
		qhangup(s->rq, nil);
465
		break;
466
	}
467
 
468
	tcb->state = newstate;
469
 
470
	if(oldstate == Syn_sent && newstate != Closed)
471
		Fsconnected(s, nil);
472
}
473
 
474
static char*
475
tcpconnect(Conv *c, char **argv, int argc)
476
{
477
	char *e;
478
	Tcpctl *tcb;
479
 
480
	tcb = (Tcpctl*)(c->ptcl);
481
	if(tcb->state != Closed)
482
		return Econinuse;
483
 
484
	e = Fsstdconnect(c, argv, argc);
485
	if(e != nil)
486
		return e;
487
	tcpstart(c, TCP_CONNECT);
488
 
489
	return nil;
490
}
491
 
492
static int
493
tcpstate(Conv *c, char *state, int n)
494
{
495
	Tcpctl *s;
496
 
497
	s = (Tcpctl*)(c->ptcl);
498
 
499
	return snprint(state, n,
500
		"%s qin %d qout %d rq %d.%d srtt %d mdev %d sst %lud cwin %lud "
501
		"swin %lud>>%d rwin %lud>>%d qscale %d timer.start %d "
502
		"timer.count %d rerecv %d katimer.start %d katimer.count %d\n",
503
		tcpstates[s->state],
504
		c->rq ? qlen(c->rq) : 0,
505
		c->wq ? qlen(c->wq) : 0,
506
		s->nreseq, s->reseqlen,
507
		s->srtt, s->mdev, s->ssthresh,
508
		s->cwind, s->snd.wnd, s->rcv.scale, s->rcv.wnd, s->snd.scale,
509
		s->qscale,
510
		s->timer.start, s->timer.count, s->rerecv,
511
		s->katimer.start, s->katimer.count);
512
}
513
 
514
static int
515
tcpinuse(Conv *c)
516
{
517
	Tcpctl *s;
518
 
519
	s = (Tcpctl*)(c->ptcl);
520
	return s->state != Closed;
521
}
522
 
523
static char*
524
tcpannounce(Conv *c, char **argv, int argc)
525
{
526
	char *e;
527
	Tcpctl *tcb;
528
 
529
	tcb = (Tcpctl*)(c->ptcl);
530
	if(tcb->state != Closed)
531
		return Econinuse;
532
 
533
	e = Fsstdannounce(c, argv, argc);
534
	if(e != nil)
535
		return e;
536
	tcpstart(c, TCP_LISTEN);
537
	Fsconnected(c, nil);
538
 
539
	return nil;
540
}
541
 
542
/*
543
 *  tcpclose is always called with the q locked
544
 */
545
static void
546
tcpclose(Conv *c)
547
{
548
	Tcpctl *tcb;
549
 
550
	tcb = (Tcpctl*)c->ptcl;
551
 
552
	qhangup(c->rq, nil);
553
	qhangup(c->wq, nil);
554
	qhangup(c->eq, nil);
555
	qflush(c->rq);
556
 
557
	switch(tcb->state) {
558
	case Listen:
559
		/*
560
		 *  reset any incoming calls to this listener
561
		 */
562
		Fsconnected(c, "Hangup");
563
 
564
		localclose(c, nil);
565
		break;
566
	case Closed:
567
	case Syn_sent:
568
		localclose(c, nil);
569
		break;
570
	case Syn_received:
571
	case Established:
572
		tcb->flgcnt++;
573
		tcb->snd.nxt++;
574
		tcpsetstate(c, Finwait1);
575
		tcpoutput(c);
576
		break;
577
	case Close_wait:
578
		tcb->flgcnt++;
579
		tcb->snd.nxt++;
580
		tcpsetstate(c, Last_ack);
581
		tcpoutput(c);
582
		break;
583
	}
584
}
585
 
586
static void
587
tcpkick(void *x)
588
{
589
	Conv *s = x;
590
	Tcpctl *tcb;
591
 
592
	tcb = (Tcpctl*)s->ptcl;
593
 
594
	if(waserror()){
595
		qunlock(s);
596
		nexterror();
597
	}
598
	qlock(s);
599
 
600
	switch(tcb->state) {
601
	case Syn_sent:
602
	case Syn_received:
603
	case Established:
604
	case Close_wait:
605
		/*
606
		 * Push data
607
		 */
608
		tcpoutput(s);
609
		break;
610
	default:
611
		localclose(s, "Hangup");
612
		break;
613
	}
614
 
615
	qunlock(s);
616
	poperror();
617
}
618
 
619
static int seq_lt(ulong, ulong);	/* forward declaration, needed by tcprcvwin() below */
620
 
621
static void
622
tcprcvwin(Conv *s)				/* Call with tcb locked */
623
{
624
	int w;
625
	Tcpctl *tcb;
626
 
627
	tcb = (Tcpctl*)s->ptcl;
628
	w = tcb->window - qlen(s->rq);
629
	if(w < 0)
630
		w = 0;
631
	/* RFC 1122 § 4.2.2.17 do not move right edge of window left */
632
	if(seq_lt(tcb->rcv.nxt + w, tcb->rcv.wptr))
633
		w = tcb->rcv.wptr - tcb->rcv.nxt;
634
	if(w != tcb->rcv.wnd)
635
	if(w>>tcb->rcv.scale == 0 || tcb->window > 4*tcb->mss && w < tcb->mss/4){
636
		tcb->rcv.blocked = 1;
637
		netlog(s->p->f, Logtcp, "tcprcvwin: window %lud qlen %d ws %ud lport %d\n",
638
			tcb->window, qlen(s->rq), tcb->rcv.scale, s->lport);
639
	}
640
	tcb->rcv.wnd = w;
641
	tcb->rcv.wptr = tcb->rcv.nxt + w;
642
}
643
 
644
static void
645
tcpacktimer(void *v)
646
{
647
	Tcpctl *tcb;
648
	Conv *s;
649
 
650
	s = v;
651
	tcb = (Tcpctl*)s->ptcl;
652
 
653
	if(waserror()){
654
		qunlock(s);
655
		nexterror();
656
	}
657
	qlock(s);
658
	if(tcb->state != Closed){
659
		tcb->flags |= FORCE;
660
		tcpoutput(s);
661
	}
662
	qunlock(s);
663
	poperror();
664
}
665
 
666
static void
667
tcpcongestion(Tcpctl *tcb)
668
{
669
	ulong inflight;
670
 
671
	inflight = tcb->snd.nxt - tcb->snd.una;
672
	if(inflight > tcb->cwind)
673
		inflight = tcb->cwind;
674
	tcb->ssthresh = inflight / 2;
675
	if(tcb->ssthresh < 2*tcb->mss)
676
		tcb->ssthresh = 2*tcb->mss;
677
}
678
 
679
/*
 *  Byte counting limit used by tcpabcincr(): during slow start the
 *  congestion window grows by at most L*mss per ack.  2 is the
 *  aggressive setting; the original note gives the legal range as
 *  1.0 to 2.0.
 */
enum {
	L	= 2,
};
682
 
683
static void
684
tcpabcincr(Tcpctl *tcb, uint acked)
685
{
686
	uint limit;
687
 
688
	tcb->abcbytes += acked;
689
	if(tcb->cwind < tcb->ssthresh){
690
		/* slow start */
691
		if(tcb->snd.rto)
692
			limit = tcb->mss;
693
		else
694
			limit = L*tcb->mss;
695
		tcb->cwind += MIN(tcb->abcbytes, limit);
696
		tcb->abcbytes = 0;
697
	} else {
698
		tcb->snd.rto = 0;
699
		/* avoidance */
700
		if(tcb->abcbytes >= tcb->cwind){
701
			tcb->abcbytes -= tcb->cwind;
702
			tcb->cwind += tcb->mss;
703
		}
704
	}
705
}
706
 
707
static void
708
tcpcreate(Conv *c)
709
{
710
	c->rq = qopen(QMAX, Qcoalesce, tcpacktimer, c);
711
	c->wq = qopen(QMAX, Qkick, tcpkick, c);
712
}
713
 
714
static void
715
timerstate(Tcppriv *priv, Tcptimer *t, int newstate)
716
{
717
	if(newstate != TcptimerON){
718
		if(t->state == TcptimerON){
719
			/* unchain */
720
			if(priv->timers == t){
721
				priv->timers = t->next;
722
				if(t->prev != nil)
723
					panic("timerstate1");
724
			}
725
			if(t->next)
726
				t->next->prev = t->prev;
727
			if(t->prev)
728
				t->prev->next = t->next;
729
			t->next = t->prev = nil;
730
		}
731
	} else {
732
		if(t->state != TcptimerON){
733
			/* chain */
734
			if(t->prev != nil || t->next != nil)
735
				panic("timerstate2");
736
			t->prev = nil;
737
			t->next = priv->timers;
738
			if(t->next)
739
				t->next->prev = t;
740
			priv->timers = t;
741
		}
742
	}
743
	t->state = newstate;
744
}
745
 
746
static void
747
tcpackproc(void *a)
748
{
749
	Tcptimer *t, *tp, *timeo;
750
	Proto *tcp;
751
	Tcppriv *priv;
752
	int loop;
753
 
754
	tcp = a;
755
	priv = tcp->priv;
756
 
757
	for(;;) {
758
		tsleep(&up->sleep, return0, 0, MSPTICK);
759
 
760
		qlock(&priv->tl);
761
		timeo = nil;
762
		loop = 0;
763
		for(t = priv->timers; t != nil; t = tp) {
764
			if(loop++ > 10000)
765
				panic("tcpackproc1");
766
			tp = t->next;
767
 			if(t->state == TcptimerON) {
768
				t->count--;
769
				if(t->count == 0) {
770
					timerstate(priv, t, TcptimerDONE);
771
					t->readynext = timeo;
772
					timeo = t;
773
				}
774
			}
775
		}
776
		qunlock(&priv->tl);
777
 
778
		loop = 0;
779
		for(t = timeo; t != nil; t = t->readynext) {
780
			if(loop++ > 10000)
781
				panic("tcpackproc2");
782
			if(t->state == TcptimerDONE && t->func != nil && !waserror()){
783
				(*t->func)(t->arg);
784
				poperror();
785
			}
786
		}
787
 
788
		limborexmit(tcp);
789
	}
790
}
791
 
792
static void
793
tcpgo(Tcppriv *priv, Tcptimer *t)
794
{
795
	if(t == nil || t->start == 0)
796
		return;
797
 
798
	qlock(&priv->tl);
799
	t->count = t->start;
800
	timerstate(priv, t, TcptimerON);
801
	qunlock(&priv->tl);
802
}
803
 
804
static void
805
tcphalt(Tcppriv *priv, Tcptimer *t)
806
{
807
	if(t == nil)
808
		return;
809
 
810
	qlock(&priv->tl);
811
	timerstate(priv, t, TcptimerOFF);
812
	qunlock(&priv->tl);
813
}
814
 
815
/*
 *  Exponential backoff multiplier: 2ⁿ.
 *
 *  The exponent is clamped to the range in which the shift is
 *  defined and the result is a positive int: a negative n is
 *  treated as 0, and an n that would shift into or past the sign
 *  bit yields the largest representable power of two.  Within
 *  that range the result is exactly 1<<n, as before.
 */
static int
backoff(int n)
{
	int maxshift;

	maxshift = 8*(int)sizeof(int) - 2;
	if(n < 0)
		n = 0;
	else if(n > maxshift)
		n = maxshift;
	return 1 << n;
}
820
 
821
static void
822
localclose(Conv *s, char *reason)	/* called with tcb locked */
823
{
824
	Tcpctl *tcb;
825
	Tcppriv *tpriv;
826
 
827
	tpriv = s->p->priv;
828
	tcb = (Tcpctl*)s->ptcl;
829
 
830
	iphtrem(&tpriv->ht, s);
831
 
832
	tcphalt(tpriv, &tcb->timer);
833
	tcphalt(tpriv, &tcb->rtt_timer);
834
	tcphalt(tpriv, &tcb->acktimer);
835
	tcphalt(tpriv, &tcb->katimer);
836
 
837
	/* Flush reassembly queue; nothing more can arrive */
838
	dumpreseq(tcb);
839
 
840
	if(tcb->state == Syn_sent)
841
		Fsconnected(s, reason);
842
	if(s->state == Announced)
843
		wakeup(&s->listenr);
844
 
845
	qhangup(s->rq, reason);
846
	qhangup(s->wq, reason);
847
 
848
	tcpsetstate(s, Closed);
849
}
850
 
851
/* mtu (- TCP + IP hdr len) of 1st hop */
852
static int
853
tcpmtu(Proto *tcp, uchar *addr, int version, uint *scale)
854
{
855
	Ipifc *ifc;
856
	int mtu;
857
 
858
	ifc = findipifc(tcp->f, addr, 0);
859
	switch(version){
860
	default:
861
	case V4:
862
		mtu = DEF_MSS;
863
		if(ifc != nil)
864
			mtu = ifc->maxtu - ifc->m->hsize - (TCP4_PKT + TCP4_HDRSIZE);
865
		break;
866
	case V6:
867
		mtu = DEF_MSS6;
868
		if(ifc != nil)
869
			mtu = ifc->maxtu - ifc->m->hsize - (TCP6_PKT + TCP6_HDRSIZE);
870
		break;
871
	}
872
	/*
873
	 * set the ws.  it doesn't commit us to anything.
874
	 * ws is the ultimate limit to the bandwidth-delay product.
875
	 */
876
	*scale = Defadvscale;
877
 
878
	return mtu;
879
}
880
 
881
static void
882
inittcpctl(Conv *s, int mode)
883
{
884
	Tcpctl *tcb;
885
	Tcp4hdr* h4;
886
	Tcp6hdr* h6;
887
	Tcppriv *tpriv;
888
	int mss;
889
 
890
	tcb = (Tcpctl*)s->ptcl;
891
 
892
	memset(tcb, 0, sizeof(Tcpctl));
893
 
894
	tcb->ssthresh = QMAX;			/* reset by tcpsetscale() */
895
	tcb->srtt = tcp_irtt<<LOGAGAIN;
896
	tcb->mdev = 0;
897
 
898
	/* setup timers */
899
	tcb->timer.start = tcp_irtt / MSPTICK;
900
	tcb->timer.func = tcptimeout;
901
	tcb->timer.arg = s;
902
	tcb->rtt_timer.start = MAX_TIME;
903
	tcb->acktimer.start = TCP_ACK / MSPTICK;
904
	tcb->acktimer.func = tcpacktimer;
905
	tcb->acktimer.arg = s;
906
	tcb->katimer.start = DEF_KAT / MSPTICK;
907
	tcb->katimer.func = tcpkeepalive;
908
	tcb->katimer.arg = s;
909
 
910
	mss = DEF_MSS;
911
 
912
	/* create a prototype(pseudo) header */
913
	if(mode != TCP_LISTEN){
914
		if(ipcmp(s->laddr, IPnoaddr) == 0)
915
			findlocalip(s->p->f, s->laddr, s->raddr);
916
 
917
		switch(s->ipversion){
918
		case V4:
919
			h4 = &tcb->protohdr.tcp4hdr;
920
			memset(h4, 0, sizeof(*h4));
921
			h4->proto = IP_TCPPROTO;
922
			hnputs(h4->tcpsport, s->lport);
923
			hnputs(h4->tcpdport, s->rport);
924
			v6tov4(h4->tcpsrc, s->laddr);
925
			v6tov4(h4->tcpdst, s->raddr);
926
			break;
927
		case V6:
928
			h6 = &tcb->protohdr.tcp6hdr;
929
			memset(h6, 0, sizeof(*h6));
930
			h6->proto = IP_TCPPROTO;
931
			hnputs(h6->tcpsport, s->lport);
932
			hnputs(h6->tcpdport, s->rport);
933
			ipmove(h6->tcpsrc, s->laddr);
934
			ipmove(h6->tcpdst, s->raddr);
935
			mss = DEF_MSS6;
936
			break;
937
		default:
938
			panic("inittcpctl: version %d", s->ipversion);
939
		}
940
	}
941
 
942
	tcb->mss = tcb->cwind = mss;
943
	tcb->abcbytes = 0;
944
	tpriv = s->p->priv;
945
	tpriv->stats[Mss] = tcb->mss;
946
 
947
	/* default is no window scaling */
948
	tcpsetscale(s, tcb, 0, 0);
949
}
950
 
951
/*
952
 *  called with s qlocked
953
 */
954
static void
955
tcpstart(Conv *s, int mode)
956
{
957
	Tcpctl *tcb;
958
	Tcppriv *tpriv;
959
	char kpname[KNAMELEN];
960
 
961
	tpriv = s->p->priv;
962
 
963
	if(tpriv->ackprocstarted == 0){
964
		qlock(&tpriv->apl);
965
		if(tpriv->ackprocstarted == 0){
966
			snprint(kpname, sizeof kpname, "#I%dtcpack", s->p->f->dev);
967
			kproc(kpname, tcpackproc, s->p);
968
			tpriv->ackprocstarted = 1;
969
		}
970
		qunlock(&tpriv->apl);
971
	}
972
 
973
	tcb = (Tcpctl*)s->ptcl;
974
 
975
	inittcpctl(s, mode);
976
 
977
	iphtadd(&tpriv->ht, s);
978
	switch(mode) {
979
	case TCP_LISTEN:
980
		tpriv->stats[PassiveOpens]++;
981
		tcb->flags |= CLONE;
982
		tcpsetstate(s, Listen);
983
		break;
984
 
985
	case TCP_CONNECT:
986
		tpriv->stats[ActiveOpens]++;
987
		tcb->flags |= ACTIVE;
988
		tcpsndsyn(s, tcb);
989
		tcpsetstate(s, Syn_sent);
990
		tcpoutput(s);
991
		break;
992
	}
993
}
994
 
995
static char*
996
tcpflag(char *buf, char *e, ushort flag)
997
{
998
	char *p;
999
 
1000
	p = seprint(buf, e, "%d", flag>>10);	/* Head len */
1001
	if(flag & URG)
1002
		p = seprint(p, e, " URG");
1003
	if(flag & ACK)
1004
		p = seprint(p, e, " ACK");
1005
	if(flag & PSH)
1006
		p = seprint(p, e, " PSH");
1007
	if(flag & RST)
1008
		p = seprint(p, e, " RST");
1009
	if(flag & SYN)
1010
		p = seprint(p, e, " SYN");
1011
	if(flag & FIN)
1012
		p = seprint(p, e, " FIN");
1013
	USED(p);
1014
	return buf;
1015
}
1016
 
1017
static Block*
1018
htontcp6(Tcp *tcph, Block *data, Tcp6hdr *ph, Tcpctl *tcb)
1019
{
1020
	int dlen;
1021
	Tcp6hdr *h;
1022
	ushort csum;
1023
	ushort hdrlen, optpad = 0;
1024
	uchar *opt;
1025
 
1026
	hdrlen = TCP6_HDRSIZE;
1027
	if(tcph->flags & SYN){
1028
		if(tcph->mss)
1029
			hdrlen += MSS_LENGTH;
1030
		if(tcph->ws)
1031
			hdrlen += WS_LENGTH;
1032
		optpad = hdrlen & 3;
1033
		if(optpad)
1034
			optpad = 4 - optpad;
1035
		hdrlen += optpad;
1036
	}
1037
 
1038
	if(data) {
1039
		dlen = blocklen(data);
1040
		data = padblock(data, hdrlen + TCP6_PKT);
1041
		if(data == nil)
1042
			return nil;
1043
	}
1044
	else {
1045
		dlen = 0;
1046
		data = allocb(hdrlen + TCP6_PKT + 64);	/* the 64 pad is to meet mintu's */
1047
		if(data == nil)
1048
			return nil;
1049
		data->wp += hdrlen + TCP6_PKT;
1050
	}
1051
 
1052
	/* copy in pseudo ip header plus port numbers */
1053
	h = (Tcp6hdr *)(data->rp);
1054
	memmove(h, ph, TCP6_TCBPHDRSZ);
1055
 
1056
	/* compose pseudo tcp header, do cksum calculation */
1057
	hnputl(h->vcf, hdrlen + dlen);
1058
	h->ploadlen[0] = h->ploadlen[1] = h->proto = 0;
1059
	h->ttl = ph->proto;
1060
 
1061
	/* copy in variable bits */
1062
	hnputl(h->tcpseq, tcph->seq);
1063
	hnputl(h->tcpack, tcph->ack);
1064
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1065
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1066
	hnputs(h->tcpurg, tcph->urg);
1067
 
1068
	if(tcph->flags & SYN){
1069
		opt = h->tcpopt;
1070
		if(tcph->mss != 0){
1071
			*opt++ = MSSOPT;
1072
			*opt++ = MSS_LENGTH;
1073
			hnputs(opt, tcph->mss);
1074
			opt += 2;
1075
		}
1076
		if(tcph->ws != 0){
1077
			*opt++ = WSOPT;
1078
			*opt++ = WS_LENGTH;
1079
			*opt++ = tcph->ws;
1080
		}
1081
		while(optpad-- > 0)
1082
			*opt++ = NOOPOPT;
1083
	}
1084
 
1085
	if(tcb != nil && tcb->nochecksum){
1086
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1087
	} else {
1088
		csum = ptclcsum(data, TCP6_IPLEN, hdrlen+dlen+TCP6_PHDRSIZE);
1089
		hnputs(h->tcpcksum, csum);
1090
	}
1091
 
1092
	/* move from pseudo header back to normal ip header */
1093
	memset(h->vcf, 0, 4);
1094
	h->vcf[0] = IP_VER6;
1095
	hnputs(h->ploadlen, hdrlen+dlen);
1096
	h->proto = ph->proto;
1097
 
1098
	return data;
1099
}
1100
 
1101
static Block*
1102
htontcp4(Tcp *tcph, Block *data, Tcp4hdr *ph, Tcpctl *tcb)
1103
{
1104
	int dlen;
1105
	Tcp4hdr *h;
1106
	ushort csum;
1107
	ushort hdrlen, optpad = 0;
1108
	uchar *opt;
1109
 
1110
	hdrlen = TCP4_HDRSIZE;
1111
	if(tcph->flags & SYN){
1112
		if(tcph->mss)
1113
			hdrlen += MSS_LENGTH;
1114
		if(1)
1115
			hdrlen += WS_LENGTH;
1116
		optpad = hdrlen & 3;
1117
		if(optpad)
1118
			optpad = 4 - optpad;
1119
		hdrlen += optpad;
1120
	}
1121
 
1122
	if(data) {
1123
		dlen = blocklen(data);
1124
		data = padblock(data, hdrlen + TCP4_PKT);
1125
		if(data == nil)
1126
			return nil;
1127
	}
1128
	else {
1129
		dlen = 0;
1130
		data = allocb(hdrlen + TCP4_PKT + 64);	/* the 64 pad is to meet mintu's */
1131
		if(data == nil)
1132
			return nil;
1133
		data->wp += hdrlen + TCP4_PKT;
1134
	}
1135
 
1136
	/* copy in pseudo ip header plus port numbers */
1137
	h = (Tcp4hdr *)(data->rp);
1138
	memmove(h, ph, TCP4_TCBPHDRSZ);
1139
 
1140
	/* copy in variable bits */
1141
	hnputs(h->tcplen, hdrlen + dlen);
1142
	hnputl(h->tcpseq, tcph->seq);
1143
	hnputl(h->tcpack, tcph->ack);
1144
	hnputs(h->tcpflag, (hdrlen<<10) | tcph->flags);
1145
	hnputs(h->tcpwin, tcph->wnd>>(tcb != nil ? tcb->snd.scale : 0));
1146
	hnputs(h->tcpurg, tcph->urg);
1147
 
1148
	if(tcph->flags & SYN){
1149
		opt = h->tcpopt;
1150
		if(tcph->mss != 0){
1151
			*opt++ = MSSOPT;
1152
			*opt++ = MSS_LENGTH;
1153
			hnputs(opt, tcph->mss);
1154
			opt += 2;
1155
		}
1156
		/* always offer.  rfc1323 §2.2 */
1157
		if(1){
1158
			*opt++ = WSOPT;
1159
			*opt++ = WS_LENGTH;
1160
			*opt++ = tcph->ws;
1161
		}
1162
		while(optpad-- > 0)
1163
			*opt++ = NOOPOPT;
1164
	}
1165
 
1166
	if(tcb != nil && tcb->nochecksum){
1167
		h->tcpcksum[0] = h->tcpcksum[1] = 0;
1168
	} else {
1169
		csum = ptclcsum(data, TCP4_IPLEN, hdrlen+dlen+TCP4_PHDRSIZE);
1170
		hnputs(h->tcpcksum, csum);
1171
	}
1172
 
1173
	return data;
1174
}
1175
 
1176
static int
1177
ntohtcp6(Tcp *tcph, Block **bpp)
1178
{
1179
	Tcp6hdr *h;
1180
	uchar *optr;
1181
	ushort hdrlen;
1182
	ushort optlen;
1183
	int n;
1184
 
1185
	*bpp = pullupblock(*bpp, TCP6_PKT+TCP6_HDRSIZE);
1186
	if(*bpp == nil)
1187
		return -1;
1188
 
1189
	h = (Tcp6hdr *)((*bpp)->rp);
1190
	tcph->source = nhgets(h->tcpsport);
1191
	tcph->dest = nhgets(h->tcpdport);
1192
	tcph->seq = nhgetl(h->tcpseq);
1193
	tcph->ack = nhgetl(h->tcpack);
1194
	hdrlen = (h->tcpflag[0]>>2) & ~3;
1195
	if(hdrlen < TCP6_HDRSIZE) {
1196
		freeblist(*bpp);
1197
		return -1;
1198
	}
1199
 
1200
	tcph->flags = h->tcpflag[1];
1201
	tcph->wnd = nhgets(h->tcpwin);
1202
	tcph->urg = nhgets(h->tcpurg);
1203
	tcph->mss = 0;
1204
	tcph->ws = 0;
1205
	tcph->update = 0;
1206
	tcph->len = nhgets(h->ploadlen) - hdrlen;
1207
 
1208
	*bpp = pullupblock(*bpp, hdrlen+TCP6_PKT);
1209
	if(*bpp == nil)
1210
		return -1;
1211
 
1212
	optr = h->tcpopt;
1213
	n = hdrlen - TCP6_HDRSIZE;
1214
	while(n > 0 && *optr != EOLOPT) {
1215
		if(*optr == NOOPOPT) {
1216
			n--;
1217
			optr++;
1218
			continue;
1219
		}
1220
		optlen = optr[1];
1221
		if(optlen < 2 || optlen > n)
1222
			break;
1223
		switch(*optr) {
1224
		case MSSOPT:
1225
			if(optlen == MSS_LENGTH)
1226
				tcph->mss = nhgets(optr+2);
1227
			break;
1228
		case WSOPT:
1229
			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1230
				tcph->ws = *(optr+2);
1231
			break;
1232
		}
1233
		n -= optlen;
1234
		optr += optlen;
1235
	}
1236
	return hdrlen;
1237
}
1238
 
1239
static int
1240
ntohtcp4(Tcp *tcph, Block **bpp)
1241
{
1242
	Tcp4hdr *h;
1243
	uchar *optr;
1244
	ushort hdrlen;
1245
	ushort optlen;
1246
	int n;
1247
 
1248
	*bpp = pullupblock(*bpp, TCP4_PKT+TCP4_HDRSIZE);
1249
	if(*bpp == nil)
1250
		return -1;
1251
 
1252
	h = (Tcp4hdr *)((*bpp)->rp);
1253
	tcph->source = nhgets(h->tcpsport);
1254
	tcph->dest = nhgets(h->tcpdport);
1255
	tcph->seq = nhgetl(h->tcpseq);
1256
	tcph->ack = nhgetl(h->tcpack);
1257
 
1258
	hdrlen = (h->tcpflag[0]>>2) & ~3;
1259
	if(hdrlen < TCP4_HDRSIZE) {
1260
		freeblist(*bpp);
1261
		return -1;
1262
	}
1263
 
1264
	tcph->flags = h->tcpflag[1];
1265
	tcph->wnd = nhgets(h->tcpwin);
1266
	tcph->urg = nhgets(h->tcpurg);
1267
	tcph->mss = 0;
1268
	tcph->ws = 0;
1269
	tcph->update = 0;
1270
	tcph->len = nhgets(h->length) - (hdrlen + TCP4_PKT);
1271
 
1272
	*bpp = pullupblock(*bpp, hdrlen+TCP4_PKT);
1273
	if(*bpp == nil)
1274
		return -1;
1275
 
1276
	optr = h->tcpopt;
1277
	n = hdrlen - TCP4_HDRSIZE;
1278
	while(n > 0 && *optr != EOLOPT) {
1279
		if(*optr == NOOPOPT) {
1280
			n--;
1281
			optr++;
1282
			continue;
1283
		}
1284
		optlen = optr[1];
1285
		if(optlen < 2 || optlen > n)
1286
			break;
1287
		switch(*optr) {
1288
		case MSSOPT:
1289
			if(optlen == MSS_LENGTH)
1290
				tcph->mss = nhgets(optr+2);
1291
			break;
1292
		case WSOPT:
1293
			if(optlen == WS_LENGTH && *(optr+2) <= 14)
1294
				tcph->ws = *(optr+2);
1295
			break;
1296
		}
1297
		n -= optlen;
1298
		optr += optlen;
1299
	}
1300
	return hdrlen;
1301
}
1302
 
1303
/*
1304
 *  For outgoing calls, generate an initial sequence
1305
 *  number and put a SYN on the send queue
1306
 */
1307
static void
1308
tcpsndsyn(Conv *s, Tcpctl *tcb)
1309
{
1310
	Tcppriv *tpriv;
1311
 
1312
	tcb->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1313
	tcb->rttseq = tcb->iss;
1314
	tcb->snd.wl2 = tcb->iss;
1315
	tcb->snd.una = tcb->iss;
1316
	tcb->snd.rxt = tcb->iss;
1317
	tcb->snd.ptr = tcb->rttseq;
1318
	tcb->snd.nxt = tcb->rttseq;
1319
	tcb->flgcnt++;
1320
	tcb->flags |= FORCE;
1321
	tcb->sndsyntime = NOW;
1322
 
1323
	/* set desired mss and scale */
1324
	tcb->mss = tcpmtu(s->p, s->laddr, s->ipversion, &tcb->scale);
1325
	tpriv = s->p->priv;
1326
	tpriv->stats[Mss] = tcb->mss;
1327
}
1328
 
1329
void
1330
sndrst(Proto *tcp, uchar *source, uchar *dest, ushort length, Tcp *seg, uchar version, char *reason)
1331
{
1332
	Block *hbp;
1333
	uchar rflags;
1334
	Tcppriv *tpriv;
1335
	Tcp4hdr ph4;
1336
	Tcp6hdr ph6;
1337
 
1338
	netlog(tcp->f, Logtcp, "sndrst: %s\n", reason);
1339
 
1340
	tpriv = tcp->priv;
1341
 
1342
	if(seg->flags & RST)
1343
		return;
1344
 
1345
	/* make pseudo header */
1346
	switch(version) {
1347
	case V4:
1348
		memset(&ph4, 0, sizeof(ph4));
1349
		ph4.vihl = IP_VER4;
1350
		v6tov4(ph4.tcpsrc, dest);
1351
		v6tov4(ph4.tcpdst, source);
1352
		ph4.proto = IP_TCPPROTO;
1353
		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1354
		hnputs(ph4.tcpsport, seg->dest);
1355
		hnputs(ph4.tcpdport, seg->source);
1356
		break;
1357
	case V6:
1358
		memset(&ph6, 0, sizeof(ph6));
1359
		ph6.vcf[0] = IP_VER6;
1360
		ipmove(ph6.tcpsrc, dest);
1361
		ipmove(ph6.tcpdst, source);
1362
		ph6.proto = IP_TCPPROTO;
1363
		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1364
		hnputs(ph6.tcpsport, seg->dest);
1365
		hnputs(ph6.tcpdport, seg->source);
1366
		break;
1367
	default:
1368
		panic("sndrst: version %d", version);
1369
	}
1370
 
1371
	tpriv->stats[OutRsts]++;
1372
	rflags = RST;
1373
 
1374
	/* convince the other end that this reset is in band */
1375
	if(seg->flags & ACK) {
1376
		seg->seq = seg->ack;
1377
		seg->ack = 0;
1378
	}
1379
	else {
1380
		rflags |= ACK;
1381
		seg->ack = seg->seq;
1382
		seg->seq = 0;
1383
		if(seg->flags & SYN)
1384
			seg->ack++;
1385
		seg->ack += length;
1386
		if(seg->flags & FIN)
1387
			seg->ack++;
1388
	}
1389
	seg->flags = rflags;
1390
	seg->wnd = 0;
1391
	seg->urg = 0;
1392
	seg->mss = 0;
1393
	seg->ws = 0;
1394
	switch(version) {
1395
	case V4:
1396
		hbp = htontcp4(seg, nil, &ph4, nil);
1397
		if(hbp == nil)
1398
			return;
1399
		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1400
		break;
1401
	case V6:
1402
		hbp = htontcp6(seg, nil, &ph6, nil);
1403
		if(hbp == nil)
1404
			return;
1405
		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1406
		break;
1407
	default:
1408
		panic("sndrst2: version %d", version);
1409
	}
1410
}
1411
 
1412
/*
1413
 *  send a reset to the remote side and close the conversation
1414
 *  called with s qlocked
1415
 */
1416
static char*
1417
tcphangup(Conv *s)
1418
{
1419
	Tcp seg;
1420
	Tcpctl *tcb;
1421
	Block *hbp;
1422
 
1423
	tcb = (Tcpctl*)s->ptcl;
1424
	if(waserror())
1425
		return commonerror();
1426
	if(ipcmp(s->raddr, IPnoaddr) != 0) {
1427
		if(!waserror()){
1428
			memset(&seg, 0, sizeof seg);
1429
			seg.flags = RST | ACK;
1430
			seg.ack = tcb->rcv.nxt;
1431
			tcb->rcv.ackptr = seg.ack;
1432
			seg.seq = tcb->snd.ptr;
1433
			seg.wnd = 0;
1434
			seg.urg = 0;
1435
			seg.mss = 0;
1436
			seg.ws = 0;
1437
			switch(s->ipversion) {
1438
			case V4:
1439
				tcb->protohdr.tcp4hdr.vihl = IP_VER4;
1440
				hbp = htontcp4(&seg, nil, &tcb->protohdr.tcp4hdr, tcb);
1441
				ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
1442
				break;
1443
			case V6:
1444
				tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
1445
				hbp = htontcp6(&seg, nil, &tcb->protohdr.tcp6hdr, tcb);
1446
				ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
1447
				break;
1448
			default:
1449
				panic("tcphangup: version %d", s->ipversion);
1450
			}
1451
			poperror();
1452
		}
1453
	}
1454
	localclose(s, nil);
1455
	poperror();
1456
	return nil;
1457
}
1458
 
1459
/*
1460
 *  (re)send a SYN ACK
1461
 */
1462
static int
1463
sndsynack(Proto *tcp, Limbo *lp)
1464
{
1465
	Block *hbp;
1466
	Tcp4hdr ph4;
1467
	Tcp6hdr ph6;
1468
	Tcp seg;
1469
	uint scale;
1470
 
1471
	/* make pseudo header */
1472
	switch(lp->version) {
1473
	case V4:
1474
		memset(&ph4, 0, sizeof(ph4));
1475
		ph4.vihl = IP_VER4;
1476
		v6tov4(ph4.tcpsrc, lp->laddr);
1477
		v6tov4(ph4.tcpdst, lp->raddr);
1478
		ph4.proto = IP_TCPPROTO;
1479
		hnputs(ph4.tcplen, TCP4_HDRSIZE);
1480
		hnputs(ph4.tcpsport, lp->lport);
1481
		hnputs(ph4.tcpdport, lp->rport);
1482
		break;
1483
	case V6:
1484
		memset(&ph6, 0, sizeof(ph6));
1485
		ph6.vcf[0] = IP_VER6;
1486
		ipmove(ph6.tcpsrc, lp->laddr);
1487
		ipmove(ph6.tcpdst, lp->raddr);
1488
		ph6.proto = IP_TCPPROTO;
1489
		hnputs(ph6.ploadlen, TCP6_HDRSIZE);
1490
		hnputs(ph6.tcpsport, lp->lport);
1491
		hnputs(ph6.tcpdport, lp->rport);
1492
		break;
1493
	default:
1494
		panic("sndrst: version %d", lp->version);
1495
	}
1496
 
1497
	memset(&seg, 0, sizeof seg);
1498
	seg.seq = lp->iss;
1499
	seg.ack = lp->irs+1;
1500
	seg.flags = SYN|ACK;
1501
	seg.urg = 0;
1502
	seg.mss = tcpmtu(tcp, lp->laddr, lp->version, &scale);
1503
	seg.wnd = QMAX;
1504
 
1505
	/* if the other side set scale, we should too */
1506
	if(lp->rcvscale){
1507
		seg.ws = scale;
1508
		lp->sndscale = scale;
1509
	} else {
1510
		seg.ws = 0;
1511
		lp->sndscale = 0;
1512
	}
1513
 
1514
	switch(lp->version) {
1515
	case V4:
1516
		hbp = htontcp4(&seg, nil, &ph4, nil);
1517
		if(hbp == nil)
1518
			return -1;
1519
		ipoput4(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1520
		break;
1521
	case V6:
1522
		hbp = htontcp6(&seg, nil, &ph6, nil);
1523
		if(hbp == nil)
1524
			return -1;
1525
		ipoput6(tcp->f, hbp, 0, MAXTTL, DFLTTOS, nil);
1526
		break;
1527
	default:
1528
		panic("sndsnack: version %d", lp->version);
1529
	}
1530
	lp->lastsend = NOW;
1531
	return 0;
1532
}
1533
 
1534
/* limbo hash: last two bytes of address a plus port p, masked by LHTMASK */
#define hashipa(a, p) ( ( (a)[IPaddrlen-2] + (a)[IPaddrlen-1] + (p) )&LHTMASK )
1535
 
1536
/*
1537
 *  put a call into limbo and respond with a SYN ACK
1538
 *
1539
 *  called with proto locked
1540
 */
1541
static void
1542
limbo(Conv *s, uchar *source, uchar *dest, Tcp *seg, int version)
1543
{
1544
	Limbo *lp, **l;
1545
	Tcppriv *tpriv;
1546
	int h;
1547
 
1548
	tpriv = s->p->priv;
1549
	h = hashipa(source, seg->source);
1550
 
1551
	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1552
		lp = *l;
1553
		if(lp->lport != seg->dest || lp->rport != seg->source || lp->version != version)
1554
			continue;
1555
		if(ipcmp(lp->raddr, source) != 0)
1556
			continue;
1557
		if(ipcmp(lp->laddr, dest) != 0)
1558
			continue;
1559
 
1560
		/* each new SYN restarts the retransmits */
1561
		lp->irs = seg->seq;
1562
		break;
1563
	}
1564
	lp = *l;
1565
	if(lp == nil){
1566
		if(tpriv->nlimbo >= Maxlimbo && tpriv->lht[h]){
1567
			lp = tpriv->lht[h];
1568
			tpriv->lht[h] = lp->next;
1569
			lp->next = nil;
1570
		} else {
1571
			lp = malloc(sizeof(*lp));
1572
			if(lp == nil)
1573
				return;
1574
			tpriv->nlimbo++;
1575
		}
1576
		*l = lp;
1577
		lp->version = version;
1578
		ipmove(lp->laddr, dest);
1579
		ipmove(lp->raddr, source);
1580
		lp->lport = seg->dest;
1581
		lp->rport = seg->source;
1582
		lp->mss = seg->mss;
1583
		lp->rcvscale = seg->ws;
1584
		lp->irs = seg->seq;
1585
		lp->iss = (nrand(1<<16)<<16)|nrand(1<<16);
1586
	}
1587
 
1588
	if(sndsynack(s->p, lp) < 0){
1589
		*l = lp->next;
1590
		tpriv->nlimbo--;
1591
		free(lp);
1592
	}
1593
}
1594
 
1595
/*
1596
 *  resend SYN ACK's once every SYNACK_RXTIMER ms.
1597
 */
1598
static void
1599
limborexmit(Proto *tcp)
1600
{
1601
	Tcppriv *tpriv;
1602
	Limbo **l, *lp;
1603
	int h;
1604
	int seen;
1605
	ulong now;
1606
 
1607
	tpriv = tcp->priv;
1608
 
1609
	if(!canqlock(tcp))
1610
		return;
1611
	seen = 0;
1612
	now = NOW;
1613
	for(h = 0; h < NLHT && seen < tpriv->nlimbo; h++){
1614
		for(l = &tpriv->lht[h]; *l != nil && seen < tpriv->nlimbo; ){
1615
			lp = *l;
1616
			seen++;
1617
			if(now - lp->lastsend < (lp->rexmits+1)*SYNACK_RXTIMER)
1618
				continue;
1619
 
1620
			/* time it out after 1 second */
1621
			if(++(lp->rexmits) > 5){
1622
				tpriv->nlimbo--;
1623
				*l = lp->next;
1624
				free(lp);
1625
				continue;
1626
			}
1627
 
1628
			/* if we're being attacked, don't bother resending SYN ACK's */
1629
			if(tpriv->nlimbo > 100)
1630
				continue;
1631
 
1632
			if(sndsynack(tcp, lp) < 0){
1633
				tpriv->nlimbo--;
1634
				*l = lp->next;
1635
				free(lp);
1636
				continue;
1637
			}
1638
 
1639
			l = &lp->next;
1640
		}
1641
	}
1642
	qunlock(tcp);
1643
}
1644
 
1645
/*
1646
 *  lookup call in limbo.  if found, throw it out.
1647
 *
1648
 *  called with proto locked
1649
 */
1650
static void
1651
limborst(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1652
{
1653
	Limbo *lp, **l;
1654
	int h;
1655
	Tcppriv *tpriv;
1656
 
1657
	tpriv = s->p->priv;
1658
 
1659
	/* find a call in limbo */
1660
	h = hashipa(src, segp->source);
1661
	for(l = &tpriv->lht[h]; *l != nil; l = &lp->next){
1662
		lp = *l;
1663
		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1664
			continue;
1665
		if(ipcmp(lp->laddr, dst) != 0)
1666
			continue;
1667
		if(ipcmp(lp->raddr, src) != 0)
1668
			continue;
1669
 
1670
		/* RST can only follow the SYN */
1671
		if(segp->seq == lp->irs+1){
1672
			tpriv->nlimbo--;
1673
			*l = lp->next;
1674
			free(lp);
1675
		}
1676
		break;
1677
	}
1678
}
1679
 
1680
static void
1681
initialwindow(Tcpctl *tcb)
1682
{
1683
	/* RFC 3390 initial window */
1684
	if(tcb->mss < 1095)
1685
		tcb->cwind = 4*tcb->mss;
1686
	else if(tcb->mss < 2190)
1687
		tcb->cwind = 2*2190;
1688
	else
1689
		tcb->cwind = 2*tcb->mss;
1690
}
1691
 
1692
/*
1693
 *  come here when we finally get an ACK to our SYN-ACK.
1694
 *  lookup call in limbo.  if found, create a new conversation
1695
 *
1696
 *  called with proto locked
1697
 */
1698
static Conv*
1699
tcpincoming(Conv *s, Tcp *segp, uchar *src, uchar *dst, uchar version)
1700
{
1701
	Conv *new;
1702
	Tcpctl *tcb;
1703
	Tcppriv *tpriv;
1704
	Tcp4hdr *h4;
1705
	Tcp6hdr *h6;
1706
	Limbo *lp, **l;
1707
	int h;
1708
 
1709
	/* unless it's just an ack, it can't be someone coming out of limbo */
1710
	if((segp->flags & SYN) || (segp->flags & ACK) == 0)
1711
		return nil;
1712
 
1713
	tpriv = s->p->priv;
1714
 
1715
	/* find a call in limbo */
1716
	h = hashipa(src, segp->source);
1717
	for(l = &tpriv->lht[h]; (lp = *l) != nil; l = &lp->next){
1718
		netlog(s->p->f, Logtcp, "tcpincoming s %I!%ud/%I!%ud d %I!%ud/%I!%ud v %d/%d\n",
1719
			src, segp->source, lp->raddr, lp->rport,
1720
			dst, segp->dest, lp->laddr, lp->lport,
1721
			version, lp->version
1722
 		);
1723
 
1724
		if(lp->lport != segp->dest || lp->rport != segp->source || lp->version != version)
1725
			continue;
1726
		if(ipcmp(lp->laddr, dst) != 0)
1727
			continue;
1728
		if(ipcmp(lp->raddr, src) != 0)
1729
			continue;
1730
 
1731
		/* we're assuming no data with the initial SYN */
1732
		if(segp->seq != lp->irs+1 || segp->ack != lp->iss+1){
1733
			netlog(s->p->f, Logtcp, "tcpincoming s %lux/%lux a %lux %lux\n",
1734
				segp->seq, lp->irs+1, segp->ack, lp->iss+1);
1735
			lp = nil;
1736
		} else {
1737
			tpriv->nlimbo--;
1738
			*l = lp->next;
1739
		}
1740
		break;
1741
	}
1742
	if(lp == nil)
1743
		return nil;
1744
 
1745
	new = Fsnewcall(s, src, segp->source, dst, segp->dest, version);
1746
	if(new == nil)
1747
		return nil;
1748
 
1749
	memmove(new->ptcl, s->ptcl, sizeof(Tcpctl));
1750
	tcb = (Tcpctl*)new->ptcl;
1751
	tcb->flags &= ~CLONE;
1752
	tcb->timer.arg = new;
1753
	tcb->timer.state = TcptimerOFF;
1754
	tcb->acktimer.arg = new;
1755
	tcb->acktimer.state = TcptimerOFF;
1756
	tcb->katimer.arg = new;
1757
	tcb->katimer.state = TcptimerOFF;
1758
	tcb->rtt_timer.arg = new;
1759
	tcb->rtt_timer.state = TcptimerOFF;
1760
 
1761
	tcb->irs = lp->irs;
1762
	tcb->rcv.nxt = tcb->irs+1;
1763
	tcb->rcv.wptr = tcb->rcv.nxt;
1764
	tcb->rcv.wsnt = 0;
1765
	tcb->rcv.urg = tcb->rcv.nxt;
1766
 
1767
	tcb->iss = lp->iss;
1768
	tcb->rttseq = tcb->iss;
1769
	tcb->snd.wl2 = tcb->iss;
1770
	tcb->snd.una = tcb->iss+1;
1771
	tcb->snd.ptr = tcb->iss+1;
1772
	tcb->snd.nxt = tcb->iss+1;
1773
	tcb->snd.rxt = tcb->iss+1;
1774
	tcb->flgcnt = 0;
1775
	tcb->flags |= SYNACK;
1776
 
1777
	/* our sending max segment size cannot be bigger than what he asked for */
1778
	if(lp->mss != 0 && lp->mss < tcb->mss) {
1779
		tcb->mss = lp->mss;
1780
		tpriv->stats[Mss] = tcb->mss;
1781
	}
1782
 
1783
	/* window scaling */
1784
	tcpsetscale(new, tcb, lp->rcvscale, lp->sndscale);
1785
 
1786
	/* congestion window */
1787
	tcb->snd.wnd = segp->wnd;
1788
	initialwindow(tcb);
1789
 
1790
	/* set initial round trip time */
1791
	tcb->sndsyntime = lp->lastsend+lp->rexmits*SYNACK_RXTIMER;
1792
	tcpsynackrtt(new);
1793
 
1794
	free(lp);
1795
 
1796
	/* set up proto header */
1797
	switch(version){
1798
	case V4:
1799
		h4 = &tcb->protohdr.tcp4hdr;
1800
		memset(h4, 0, sizeof(*h4));
1801
		h4->proto = IP_TCPPROTO;
1802
		hnputs(h4->tcpsport, new->lport);
1803
		hnputs(h4->tcpdport, new->rport);
1804
		v6tov4(h4->tcpsrc, dst);
1805
		v6tov4(h4->tcpdst, src);
1806
		break;
1807
	case V6:
1808
		h6 = &tcb->protohdr.tcp6hdr;
1809
		memset(h6, 0, sizeof(*h6));
1810
		h6->proto = IP_TCPPROTO;
1811
		hnputs(h6->tcpsport, new->lport);
1812
		hnputs(h6->tcpdport, new->rport);
1813
		ipmove(h6->tcpsrc, dst);
1814
		ipmove(h6->tcpdst, src);
1815
		break;
1816
	default:
1817
		panic("tcpincoming: version %d", new->ipversion);
1818
	}
1819
 
1820
	tcpsetstate(new, Established);
1821
 
1822
	iphtadd(&tpriv->ht, new);
1823
 
1824
	return new;
1825
}
1826
 
1827
/*
 *  1 if x lies in the closed range [low, high], else 0.
 *  when low > high the range is taken to wrap around.
 */
static int
seq_within(ulong x, ulong low, ulong high)
{
	if(low <= high)
		return low <= x && x <= high;
	return x >= low || x <= high;
}
1840
 
1841
/* x before y, judged by the sign of their difference taken as an int */
static int
seq_lt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff < 0;
}
1846
 
1847
/* x before or equal to y, judged by the sign of their difference taken as an int */
static int
seq_le(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff <= 0;
}
1852
 
1853
/* x after y, judged by the sign of their difference taken as an int */
static int
seq_gt(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff > 0;
}
1858
 
1859
/* x after or equal to y, judged by the sign of their difference taken as an int */
static int
seq_ge(ulong x, ulong y)
{
	int diff;

	diff = (int)(x-y);
	return diff >= 0;
}
1864
 
1865
/*
1866
 *  use the time between the first SYN and it's ack as the
1867
 *  initial round trip time
1868
 */
1869
static void
1870
tcpsynackrtt(Conv *s)
1871
{
1872
	Tcpctl *tcb;
1873
	int delta;
1874
	Tcppriv *tpriv;
1875
 
1876
	tcb = (Tcpctl*)s->ptcl;
1877
	tpriv = s->p->priv;
1878
 
1879
	delta = NOW - tcb->sndsyntime;
1880
	tcb->srtt = delta<<LOGAGAIN;
1881
	tcb->mdev = delta<<LOGDGAIN;
1882
 
1883
	/* halt round trip timer */
1884
	tcphalt(tpriv, &tcb->rtt_timer);
1885
}
1886
 
1887
/*
 *  Apply the acknowledgement and window carried by seg to the
 *  send side of conversation s: duplicate-ack counting and fast
 *  retransmit/recovery, send window update, congestion window
 *  adjustment, round trip timing, and release of acked data from
 *  the write queue.  A segment is only processed once; seg->update
 *  records that it has been through here.
 */
static void
1888
update(Conv *s, Tcp *seg)
1889
{
1890
	int rtt, delta;
1891
	Tcpctl *tcb;
1892
	ulong acked;
1893
	Tcppriv *tpriv;
1894
 
1895
	/* already applied this segment */
	if(seg->update)
1896
		return;
1897
	seg->update = 1;
1898
 
1899
	tpriv = s->p->priv;
1900
	tcb = (Tcpctl*)s->ptcl;
1901
 
1902
	/* catch zero-window updates, update window & recover */
1903
	if(tcb->snd.wnd == 0 && seg->wnd > 0 &&
1904
	    seq_lt(seg->ack, tcb->snd.ptr)){
1905
		netlog(s->p->f, Logtcp, "tcp: zwu ack %lud una %lud ptr %lud win %lud\n",
1906
			seg->ack,  tcb->snd.una, tcb->snd.ptr, seg->wnd);
1907
		tcb->snd.wnd = seg->wnd;
1908
		/* jumps into the body of the fast retransmit test below */
		goto recovery;
1909
	}
1910
 
1911
	/* newreno fast retransmit */
1912
	/* a dup ack: acks nothing new while data is still outstanding */
	if(seg->ack == tcb->snd.una && tcb->snd.una != tcb->snd.nxt &&
1913
	    ++tcb->snd.dupacks == 3){		/* was TCPREXMTTHRESH */
1914
recovery:
1915
		if(tcb->snd.recovery){
1916
			tpriv->stats[RecoveryCwind]++;
1917
			tcb->cwind += tcb->mss;
1918
		}else if(seq_le(tcb->snd.rxt, seg->ack)){
1919
			/* enter recovery; snd.rxt marks where it will be complete */
			tpriv->stats[Recovery]++;
1920
			tcb->abcbytes = 0;
1921
			tcb->snd.recovery = 1;
1922
			tcb->snd.partialack = 0;
1923
			tcb->snd.rxt = tcb->snd.nxt;
1924
			tcpcongestion(tcb);
1925
			tcb->cwind = tcb->ssthresh + 3*tcb->mss;
1926
			netlog(s->p->f, Logtcpwin, "recovery inflate %ld ss %ld @%lud\n",
1927
				tcb->cwind, tcb->ssthresh, tcb->snd.rxt);
1928
			tcprxmit(s);
1929
		}else{
1930
			tpriv->stats[RecoveryNoSeq]++;
1931
			netlog(s->p->f, Logtcpwin, "!recov %lud not ≤ %lud %ld\n",
1932
				tcb->snd.rxt, seg->ack, tcb->snd.rxt - seg->ack);
1933
			/* don't enter fast retransmit, don't change ssthresh */
1934
		}
1935
	}else if(tcb->snd.recovery){
1936
		/* already in recovery: inflate the window by one mss */
		tpriv->stats[RecoveryCwind]++;
1937
		tcb->cwind += tcb->mss;
1938
	}
1939
 
1940
	/*
1941
	 *  update window
1942
	 */
1943
	if(seq_gt(seg->ack, tcb->snd.wl2)
1944
	|| (tcb->snd.wl2 == seg->ack && seg->wnd > tcb->snd.wnd)){
1945
		/* clear dupack if we advance wl2 */
1946
		if(tcb->snd.wl2 != seg->ack)
1947
			tcb->snd.dupacks = 0;
1948
		tcb->snd.wnd = seg->wnd;
1949
		tcb->snd.wl2 = seg->ack;
1950
	}
1951
 
1952
	/* nothing newly acknowledged: the rest only applies to new acks */
	if(!seq_gt(seg->ack, tcb->snd.una)){
1953
		/*
1954
		 *  don't let us hangup if sending into a closed window and
1955
		 *  we're still getting acks
1956
		 */
1957
		if((tcb->flags&RETRAN) && tcb->snd.wnd == 0)
1958
			tcb->backedoff = MAXBACKMS/4;
1959
		return;
1960
	}
1961
 
1962
	/* Compute the new send window size */
1963
	acked = seg->ack - tcb->snd.una;
1964
 
1965
	/* avoid slow start and timers for SYN acks */
1966
	/* SYNACK not yet set: mark our SYN acked and skip to done */
	if((tcb->flags & SYNACK) == 0) {
1967
		tcb->flags |= SYNACK;
1968
		acked--;
1969
		tcb->flgcnt--;
1970
		goto done;
1971
	}
1972
 
1973
	/*
1974
	 * congestion control
1975
	 */
1976
	if(tcb->snd.recovery){
1977
		if(seq_ge(seg->ack, tcb->snd.rxt)){
1978
			/* recovery finished; deflate window */
1979
			tpriv->stats[RecoveryDone]++;
1980
			tcb->snd.dupacks = 0;
1981
			tcb->snd.recovery = 0;
1982
			tcb->cwind = (tcb->snd.nxt - tcb->snd.una) + tcb->mss;
1983
			if(tcb->ssthresh < tcb->cwind)
1984
				tcb->cwind = tcb->ssthresh;
1985
			netlog(s->p->f, Logtcpwin, "recovery deflate %ld %ld\n",
1986
				tcb->cwind, tcb->ssthresh);
1987
		} else {
1988
			/* partial ack; we lost more than one segment */
1989
			tpriv->stats[RecoveryPA]++;
1990
			if(tcb->cwind > acked)
1991
				tcb->cwind -= acked;
1992
			else{
1993
				netlog(s->p->f, Logtcpwin, "partial ack neg\n");
1994
				tcb->cwind = tcb->mss;
1995
			}
1996
			netlog(s->p->f, Logtcpwin, "partial ack %ld left %ld cwind %ld\n",
1997
				acked, tcb->snd.rxt - seg->ack, tcb->cwind);
1998
 
1999
			if(acked >= tcb->mss)
2000
				tcb->cwind += tcb->mss;
2001
			tcb->snd.partialack++;
2002
		}
2003
	} else
2004
		tcpabcincr(tcb, acked);
2005
 
2006
	/* Adjust the timers according to the round trip time */
2007
	/* TODO: fix sloppy treatment of overflow cases here. */
2008
	if(tcb->rtt_timer.state == TcptimerON && seq_ge(seg->ack, tcb->rttseq)) {
2009
		tcphalt(tpriv, &tcb->rtt_timer);
2010
		/* only take a sample when no retransmission is flagged */
		if((tcb->flags&RETRAN) == 0) {
2011
			tcb->backoff = 0;
2012
			tcb->backedoff = 0;
2013
			/* elapsed timer ticks, converted to ms below */
			rtt = tcb->rtt_timer.start - tcb->rtt_timer.count;
2014
			if(rtt == 0)
2015
				rtt = 1; /* else all close sys's will rexmit in 0 time */
2016
			rtt *= MSPTICK;
2017
			if(tcb->srtt == 0) {
2018
				tcb->srtt = rtt << LOGAGAIN;
2019
				tcb->mdev = rtt << LOGDGAIN;
2020
			} else {
2021
				/* srtt and mdev are held scaled by LOGAGAIN and LOGDGAIN bits */
				delta = rtt - (tcb->srtt>>LOGAGAIN);
2022
				tcb->srtt += delta;
2023
				if(tcb->srtt <= 0)
2024
					tcb->srtt = 1;
2025
 
2026
				delta = abs(delta) - (tcb->mdev>>LOGDGAIN);
2027
				tcb->mdev += delta;
2028
				if(tcb->mdev <= 0)
2029
					tcb->mdev = 1;
2030
			}
2031
			tcpsettimer(tcb);
2032
		}
2033
	}
2034
 
2035
done:
2036
	/*
	 *  drop the acked bytes from the write queue.
	 *  NOTE(review): when fewer bytes were queued than were acked,
	 *  flgcnt is decremented; presumably the extra sequence number
	 *  belonged to a flag (e.g. FIN) -- confirm.
	 */
	if(qdiscard(s->wq, acked) < acked)
2037
		tcb->flgcnt--;
2038
	tcb->snd.una = seg->ack;
2039
 
2040
	/* newreno fast recovery */
2041
	if(tcb->snd.recovery)
2042
		tcprxmit(s);
2043
 
2044
	if(seq_gt(seg->ack, tcb->snd.urg))
2045
		tcb->snd.urg = seg->ack;
2046
 
2047
	/* data still unacked: keep the timer going; otherwise stop it */
	if(tcb->snd.una != tcb->snd.nxt){
2048
		/* `impatient' variant */
2049
		if(!tcb->snd.recovery || tcb->snd.partialack == 1){
2050
			tcb->time = NOW;
2051
			tcb->timeuna = tcb->snd.una;
2052
			tcpgo(tpriv, &tcb->timer);
2053
		}
2054
	} else
2055
		tcphalt(tpriv, &tcb->timer);
2056
 
2057
	/* never leave the send pointer behind what has been acked */
	if(seq_lt(tcb->snd.ptr, tcb->snd.una))
2058
		tcb->snd.ptr = tcb->snd.una;
2059
 
2060
	if(!tcb->snd.recovery)
2061
		tcb->flags &= ~RETRAN;
2062
	tcb->backoff = 0;
2063
	tcb->backedoff = 0;
2064
}
2065
 
2066
/*
 *  Receive one TCP segment.  bp holds the packet starting at its
 *  IPv4 or IPv6 header; the two are told apart by the version
 *  nibble of the first byte.  A non-zero checksum is verified, the
 *  TCP header is converted by ntohtcp4/ntohtcp6, the conversation
 *  is looked up (segments reaching a listener go through limbo,
 *  limborst or tcpincoming) and the segment is then run through
 *  the per-state processing below.  The unnamed Ipifc* argument is
 *  not used.
 */
static void
2067
tcpiput(Proto *tcp, Ipifc*, Block *bp)
2068
{
2069
	Tcp seg;
2070
	Tcp4hdr *h4;
2071
	Tcp6hdr *h6;
2072
	int hdrlen;
2073
	Tcpctl *tcb;
2074
	ushort length, csum;
2075
	uchar source[IPaddrlen], dest[IPaddrlen];
2076
	Conv *s;
2077
	Fs *f;
2078
	Tcppriv *tpriv;
2079
	uchar version;
2080
 
2081
	f = tcp->f;
2082
	tpriv = tcp->priv;
2083
 
2084
	tpriv->stats[InSegs]++;
2085
 
2086
	/* both views of the same bytes; which one is valid is decided next */
	h4 = (Tcp4hdr*)(bp->rp);
2087
	h6 = (Tcp6hdr*)(bp->rp);
2088
 
2089
	if((h4->vihl&0xF0)==IP_VER4) {
2090
		version = V4;
2091
		length = nhgets(h4->length);
2092
		v4tov6(dest, h4->tcpdst);
2093
		v4tov6(source, h4->tcpsrc);
2094
 
2095
		/* fill in the pseudo header fields before checksumming */
		h4->Unused = 0;
2096
		hnputs(h4->tcplen, length-TCP4_PKT);
2097
		/* skipped when the block is flagged Btcpck or the checksum field is zero */
		if(!(bp->flag & Btcpck) && (h4->tcpcksum[0] || h4->tcpcksum[1]) &&
2098
			ptclcsum(bp, TCP4_IPLEN, length-TCP4_IPLEN)) {
2099
			tpriv->stats[CsumErrs]++;
2100
			tpriv->stats[InErrs]++;
2101
			netlog(f, Logtcp, "bad tcp proto cksum\n");
2102
			freeblist(bp);
2103
			return;
2104
		}
2105
 
2106
		/* on failure bp is not freed here; ntohtcp4 frees it on a short header */
		hdrlen = ntohtcp4(&seg, &bp);
2107
		if(hdrlen < 0){
2108
			tpriv->stats[HlenErrs]++;
2109
			tpriv->stats[InErrs]++;
2110
			netlog(f, Logtcp, "bad tcp hdr len\n");
2111
			return;
2112
		}
2113
 
2114
		/* trim the packet to the size claimed by the datagram */
2115
		/* from here on length is the number of data bytes */
		length -= hdrlen+TCP4_PKT;
2116
		bp = trimblock(bp, hdrlen+TCP4_PKT, length);
2117
		if(bp == nil){
2118
			tpriv->stats[LenErrs]++;
2119
			tpriv->stats[InErrs]++;
2120
			netlog(f, Logtcp, "tcp len < 0 after trim\n");
2121
			return;
2122
		}
2123
	}
2124
	else {
2125
		/* saved so they can be restored after the checksum */
		int ttl = h6->ttl;
2126
		int proto = h6->proto;
2127
 
2128
		version = V6;
2129
		length = nhgets(h6->ploadlen);
2130
		ipmove(dest, h6->tcpdst);
2131
		ipmove(source, h6->tcpsrc);
2132
 
2133
		/* rewrite the header fields in place for the checksum; undone below */
		h6->ploadlen[0] = h6->ploadlen[1] = h6->proto = 0;
2134
		h6->ttl = proto;
2135
		hnputl(h6->vcf, length);
2136
		if((h6->tcpcksum[0] || h6->tcpcksum[1]) &&
2137
		    (csum = ptclcsum(bp, TCP6_IPLEN, length+TCP6_PHDRSIZE)) != 0) {
2138
			tpriv->stats[CsumErrs]++;
2139
			tpriv->stats[InErrs]++;
2140
			netlog(f, Logtcp,
2141
			    "bad tcpv6 proto cksum: got %#ux, computed %#ux\n",
2142
				h6->tcpcksum[0]<<8 | h6->tcpcksum[1], csum);
2143
			freeblist(bp);
2144
			return;
2145
		}
2146
		h6->ttl = ttl;
2147
		h6->proto = proto;
2148
		hnputs(h6->ploadlen, length);
2149
 
2150
		/* on failure bp is not freed here; ntohtcp6 frees it on a short header */
		hdrlen = ntohtcp6(&seg, &bp);
2151
		if(hdrlen < 0){
2152
			tpriv->stats[HlenErrs]++;
2153
			tpriv->stats[InErrs]++;
2154
			netlog(f, Logtcp, "bad tcpv6 hdr len\n");
2155
			return;
2156
		}
2157
 
2158
		/* trim the packet to the size claimed by the datagram */
2159
		/* from here on length is the number of data bytes */
		length -= hdrlen;
2160
		bp = trimblock(bp, hdrlen+TCP6_PKT, length);
2161
		if(bp == nil){
2162
			tpriv->stats[LenErrs]++;
2163
			tpriv->stats[InErrs]++;
2164
			netlog(f, Logtcp, "tcpv6 len < 0 after trim\n");
2165
			return;
2166
		}
2167
	}
2168
 
2169
	/* lock protocol while searching for a conversation */
2170
	qlock(tcp);
2171
 
2172
	/* Look for a matching conversation */
2173
	s = iphtlook(&tpriv->ht, source, seg.source, dest, seg.dest);
2174
	if(s == nil){
2175
		netlog(f, Logtcp, "iphtlook(src %I!%d, dst %I!%d) failed\n",
2176
			source, seg.source, dest, seg.dest);
2177
		/* also reached by goto when tcpincoming finds no call in limbo */
reset:
2178
		qunlock(tcp);
2179
		sndrst(tcp, source, dest, length, &seg, version, "no conversation");
2180
		freeblist(bp);
2181
		return;
2182
	}
2183
 
2184
	/* if it's a listener, look for the right flags and get a new conv */
2185
	tcb = (Tcpctl*)s->ptcl;
2186
	if(tcb->state == Listen){
2187
		if(seg.flags & RST){
2188
			limborst(s, &seg, source, dest, version);
2189
			qunlock(tcp);
2190
			freeblist(bp);
2191
			return;
2192
		}
2193
 
2194
		/* if this is a new SYN, put the call into limbo */
2195
		if((seg.flags & SYN) && (seg.flags & ACK) == 0){
2196
			limbo(s, source, dest, &seg, version);
2197
			qunlock(tcp);
2198
			freeblist(bp);
2199
			return;
2200
		}
2201
 
2202
		/*
2203
		 *  if there's a matching call in limbo, tcpincoming will
2204
		 *  return a new conversation for it, already set to Established
2205
		 */
2206
		s = tcpincoming(s, &seg, source, dest, version);
2207
		if(s == nil)
2208
			goto reset;
2209
	}
2210
 
2211
	/* The rest of the input state machine is run with the control block
2212
	 * locked and implements the state machine directly out of the RFC.
2213
	 * Out-of-band data is ignored - it was always a bad idea.
2214
	 */
2215
	tcb = (Tcpctl*)s->ptcl;
2216
	/* an error raised below releases the conversation lock before propagating */
	if(waserror()){
2217
		qunlock(s);
2218
		nexterror();
2219
	}
2220
	/* take the conversation lock before letting go of the protocol lock */
	qlock(s);
2221
	qunlock(tcp);
2222
 
2223
	/* fix up window */
2224
	seg.wnd <<= tcb->rcv.scale;
2225
 
2226
	/* every input packet in puts off the keep alive time out */
2227
	tcpsetkacounter(tcb);
2228
 
2229
	switch(tcb->state) {
2230
	case Closed:
2231
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2232
		goto raise;
2233
	case Syn_sent:
2234
		if(seg.flags & ACK) {
2235
			if(!seq_within(seg.ack, tcb->iss+1, tcb->snd.nxt)) {
2236
				sndrst(tcp, source, dest, length, &seg, version,
2237
					 "bad seq in Syn_sent");
2238
				goto raise;
2239
			}
2240
		}
2241
		if(seg.flags & RST) {
2242
			if(seg.flags & ACK)
2243
				localclose(s, Econrefused);
2244
			goto raise;
2245
		}
2246
 
2247
		if(seg.flags & SYN) {
2248
			procsyn(s, &seg);
2249
			if(seg.flags & ACK){
2250
				update(s, &seg);
2251
				tcpsynackrtt(s);
2252
				tcpsetstate(s, Established);
2253
				tcpsetscale(s, tcb, seg.ws, tcb->scale);
2254
			}
2255
			else {
2256
				tcb->time = NOW;
2257
				tcpsetstate(s, Syn_received);	/* DLP - shouldn't this be a reset? */
2258
			}
2259
 
2260
			/* data or a FIN came with the SYN: go on to the general processing */
			if(length != 0 || (seg.flags & FIN))
2261
				break;
2262
 
2263
			freeblist(bp);
2264
			goto output;
2265
		}
2266
		else
2267
			freeblist(bp);
2268
 
2269
		qunlock(s);
2270
		poperror();
2271
		return;
2272
	case Syn_received:
2273
		/* doesn't matter if it's the correct ack, we're just trying to set timing */
2274
		if(seg.flags & ACK)
2275
			tcpsynackrtt(s);
2276
		break;
2277
	}
2278
 
2279
	/*
2280
	 *  One DOS attack is to open connections to us and then forget about them,
2281
	 *  thereby tying up a conv at no long term cost to the attacker.
2282
	 *  This is an attempt to defeat these stateless DOS attacks.  See
2283
	 *  corresponding code in tcpsendka().
2284
	 */
2285
	if(tcb->state != Syn_received && (seg.flags & RST) == 0){
2286
		if(tcpporthogdefense
2287
		&& seq_within(seg.ack, tcb->snd.una-(1<<31), tcb->snd.una-(1<<29))){
2288
			print("stateless hog %I.%d->%I.%d f %ux %lux - %lux - %lux\n",
2289
				source, seg.source, dest, seg.dest, seg.flags,
2290
				tcb->snd.una-(1<<31), seg.ack, tcb->snd.una-(1<<29));
2291
			localclose(s, "stateless hog");
2292
		}
2293
	}
2294
 
2295
	/* Cut the data to fit the receive window */
2296
	tcprcvwin(s);
2297
	if(tcptrim(tcb, &seg, &bp, &length) == -1) {
2298
		/* nothing usable in the segment; the one-byte case just below rcv.nxt is not logged */
		if(seg.seq+1 != tcb->rcv.nxt || length != 1)
2299
		netlog(f, Logtcp, "tcp: trim: !inwind: seq %lud-%lud win "
2300
			"%lud-%lud l %d from %I\n", seg.seq,
2301
			seg.seq + length - 1, tcb->rcv.nxt,
2302
			tcb->rcv.nxt + tcb->rcv.wnd-1, length, s->raddr);
2303
		update(s, &seg);
2304
		if(qlen(s->wq)+tcb->flgcnt == 0 && tcb->state == Closing) {
2305
			tcphalt(tpriv, &tcb->rtt_timer);
2306
			tcphalt(tpriv, &tcb->acktimer);
2307
			tcphalt(tpriv, &tcb->katimer);
2308
			tcpsetstate(s, Time_wait);
2309
			tcb->timer.start = MSL2*(1000 / MSPTICK);
2310
			tcpgo(tpriv, &tcb->timer);
2311
		}
2312
		if(!(seg.flags & RST)) {
2313
			tcb->flags |= FORCE;
2314
			goto output;
2315
		}
2316
		qunlock(s);
2317
		poperror();
2318
		return;
2319
	}
2320
 
2321
	/* Cannot accept so answer with a rst */
2322
	if(length && tcb->state == Closed) {
2323
		sndrst(tcp, source, dest, length, &seg, version, "sending to Closed");
2324
		goto raise;
2325
	}
2326
 
2327
	/* The segment is beyond the current receive pointer so
2328
	 * queue the data in the resequence queue
2329
	 */
2330
	if(seg.seq != tcb->rcv.nxt)
2331
	if(length != 0 || (seg.flags & (SYN|FIN))) {
2332
		update(s, &seg);
2333
		if(addreseq(f, tcb, tpriv, &seg, bp, length) < 0)
2334
			print("reseq %I.%d -> %I.%d\n", s->raddr, s->rport,
2335
				s->laddr, s->lport);
2336
		tcb->flags |= FORCE;	/* force duplicate ack; RFC 5681 §3.2 */
2337
		goto output;
2338
	}
2339
 
2340
	if(tcb->nreseq > 0)
2341
		tcb->flags |= FORCE; /* filled hole in seq. space; RFC 5681 §3.2 */
2342
 
2343
	/*
2344
	 *  keep looping till we've processed this packet plus any
2345
	 *  adjacent packets in the resequence queue
2346
	 */
2347
	for(;;) {
2348
		if(seg.flags & RST) {
2349
			if(tcb->state == Established) {
2350
				tpriv->stats[EstabResets]++;
2351
				if(tcb->rcv.nxt != seg.seq)
2352
					netlog(f, Logtcp, "out of order RST "
2353
						"rcvd: %I.%d -> %I.%d, rcv.nxt "
2354
						"%lux seq %lux\n",
2355
						s->raddr, s->rport, s->laddr,
2356
						s->lport, tcb->rcv.nxt, seg.seq);
2357
			}
2358
			localclose(s, Econrefused);
2359
			goto raise;
2360
		}
2361
 
2362
		/* beyond this point a segment without ACK is dropped */
		if((seg.flags&ACK) == 0)
2363
			goto raise;
2364
 
2365
		switch(tcb->state) {
2366
		case Syn_received:
2367
			if(!seq_within(seg.ack, tcb->snd.una+1, tcb->snd.nxt)){
2368
				sndrst(tcp, source, dest, length, &seg, version,
2369
					"bad seq in Syn_received");
2370
				goto raise;
2371
			}
2372
			update(s, &seg);
2373
			tcpsetstate(s, Established);
2374
			/* fall through; the second update returns at once since seg.update is set */
		case Established:
2375
		case Close_wait:
2376
			update(s, &seg);
2377
			break;
2378
		case Finwait1:
2379
			update(s, &seg);
2380
			/* nothing left queued or unacked: move on to Finwait2 */
			if(qlen(s->wq)+tcb->flgcnt == 0){
2381
				tcphalt(tpriv, &tcb->rtt_timer);
2382
				tcphalt(tpriv, &tcb->acktimer);
2383
				tcpsetkacounter(tcb);
2384
				tcb->time = NOW;
2385
				tcpsetstate(s, Finwait2);
2386
				tcb->katimer.start = MSL2 * (1000 / MSPTICK);
2387
				tcpgo(tpriv, &tcb->katimer);
2388
			}
2389
			break;
2390
		case Finwait2:
2391
			update(s, &seg);
2392
			break;
2393
		case Closing:
2394
			update(s, &seg);
2395
			if(qlen(s->wq)+tcb->flgcnt == 0) {
2396
				tcphalt(tpriv, &tcb->rtt_timer);
2397
				tcphalt(tpriv, &tcb->acktimer);
2398
				tcphalt(tpriv, &tcb->katimer);
2399
				tcpsetstate(s, Time_wait);
2400
				tcb->timer.start = MSL2*(1000 / MSPTICK);
2401
				tcpgo(tpriv, &tcb->timer);
2402
			}
2403
			break;
2404
		case Last_ack:
2405
			update(s, &seg);
2406
			if(qlen(s->wq)+tcb->flgcnt == 0) {
2407
				localclose(s, nil);
2408
				goto raise;
2409
			}
2410
			/* fall through (there is no break here) */
		case Time_wait:
2411
			tcb->flags |= FORCE;
2412
			if(tcb->timer.state != TcptimerON)
2413
				tcpgo(tpriv, &tcb->timer);
2414
		}
2415
 
2416
		if((seg.flags&URG) && seg.urg) {
2417
			if(seq_gt(seg.urg + seg.seq, tcb->rcv.urg)) {
2418
				tcb->rcv.urg = seg.urg + seg.seq;
2419
				pullblock(&bp, seg.urg);
2420
			}
2421
		}
2422
		else
2423
		if(seq_gt(tcb->rcv.nxt, tcb->rcv.urg))
2424
			tcb->rcv.urg = tcb->rcv.nxt;
2425
 
2426
		if(length == 0) {
2427
			if(bp != nil)
2428
				freeblist(bp);
2429
		}
2430
		else {
2431
			switch(tcb->state){
2432
			default:
2433
				/* Ignore segment text */
2434
				if(bp != nil)
2435
					freeblist(bp);
2436
				break;
2437
 
2438
			case Syn_received:
2439
			case Established:
2440
			case Finwait1:
2441
				/* If we still have some data place on
2442
				 * receive queue
2443
				 */
2444
				if(bp) {
2445
					bp = packblock(bp);
2446
					if(bp == nil)
2447
						panic("tcp packblock");
2448
					qpassnolim(s->rq, bp);
2449
					bp = nil;
2450
				}
2451
				tcb->rcv.nxt += length;
2452
 
2453
				/*
2454
				 *  turn on the acktimer if there's something
2455
				 *  to ack
2456
				 */
2457
				if(tcb->acktimer.state != TcptimerON)
2458
					tcpgo(tpriv, &tcb->acktimer);
2459
 
2460
				break;
2461
			case Finwait2:
2462
				/* no process to read the data, send a reset */
2463
				if(bp != nil)
2464
					freeblist(bp);
2465
				sndrst(tcp, source, dest, length, &seg, version,
2466
					"send to Finwait2");
2467
				qunlock(s);
2468
				poperror();
2469
				return;
2470
			}
2471
		}
2472
 
2473
		if(seg.flags & FIN) {
2474
			tcb->flags |= FORCE;
2475
 
2476
			/* where a FIN is accepted it advances rcv.nxt by one */
			switch(tcb->state) {
2477
			case Syn_received:
2478
			case Established:
2479
				tcb->rcv.nxt++;
2480
				tcpsetstate(s, Close_wait);
2481
				break;
2482
			case Finwait1:
2483
				tcb->rcv.nxt++;
2484
				if(qlen(s->wq)+tcb->flgcnt == 0) {
2485
					tcphalt(tpriv, &tcb->rtt_timer);
2486
					tcphalt(tpriv, &tcb->acktimer);
2487
					tcphalt(tpriv, &tcb->katimer);
2488
					tcpsetstate(s, Time_wait);
2489
					tcb->timer.start = MSL2*(1000/MSPTICK);
2490
					tcpgo(tpriv, &tcb->timer);
2491
				}
2492
				else
2493
					tcpsetstate(s, Closing);
2494
				break;
2495
			case Finwait2:
2496
				tcb->rcv.nxt++;
2497
				tcphalt(tpriv, &tcb->rtt_timer);
2498
				tcphalt(tpriv, &tcb->acktimer);
2499
				tcphalt(tpriv, &tcb->katimer);
2500
				tcpsetstate(s, Time_wait);
2501
				tcb->timer.start = MSL2 * (1000/MSPTICK);
2502
				tcpgo(tpriv, &tcb->timer);
2503
				break;
2504
			case Close_wait:
2505
			case Closing:
2506
			case Last_ack:
2507
				break;
2508
			case Time_wait:
2509
				tcpgo(tpriv, &tcb->timer);
2510
				break;
2511
			}
2512
		}
2513
 
2514
		/*
2515
		 *  get next adjacent segment from the resequence queue.
2516
		 *  dump/trim any overlapping segments
2517
		 */
2518
		for(;;) {
2519
			if(tcb->reseq == nil)
2520
				goto output;
2521
 
2522
			/* the next queued segment starts beyond rcv.nxt: still a hole */
			if(seq_ge(tcb->rcv.nxt, tcb->reseq->seg.seq) == 0)
2523
				goto output;
2524
 
2525
			getreseq(tcb, &seg, &bp, &length);
2526
 
2527
			tcprcvwin(s);
2528
			/* a usable segment: leave this loop and process it in the outer one */
			if(tcptrim(tcb, &seg, &bp, &length) == 0){
2529
				tcb->flags |= FORCE;
2530
				break;
2531
			}
2532
		}
2533
	}
2534
/* normal exit: let tcpoutput send whatever is due, then unlock */
output:
2535
	tcpoutput(s);
2536
	qunlock(s);
2537
	poperror();
2538
	return;
2539
/* drop exit: unlock, free the packet and kick the conversation */
raise:
2540
	qunlock(s);
2541
	poperror();
2542
	freeblist(bp);
2543
	tcpkick(s);
2544
}
2545
 
2546
/*
2547
 *  always enters and exits with the s locked.  We drop
2548
 *  the lock to ipoput the packet so some care has to be
2549
 *  taken by callers.
2550
 */
2551
static void
2552
tcpoutput(Conv *s)
2553
{
2554
	Tcp seg;
2555
	uint msgs;
2556
	Tcpctl *tcb;
2557
	Block *hbp, *bp;
2558
	int sndcnt;
2559
	ulong ssize, dsize, sent;
2560
	Fs *f;
2561
	Tcppriv *tpriv;
2562
	uchar version;
2563
 
2564
	f = s->p->f;
2565
	tpriv = s->p->priv;
2566
	version = s->ipversion;
2567
 
2568
	tcb = (Tcpctl*)s->ptcl;
2569
 
2570
	/* force ack every 2*mss */
2571
	if((tcb->flags & FORCE) == 0 &&
2572
	    tcb->rcv.nxt - tcb->rcv.ackptr >= 2*tcb->mss){
2573
		tpriv->stats[Delayack]++;
2574
		tcb->flags |= FORCE;
2575
	}
2576
 
2577
	/* force ack if window opening */
2578
	if((tcb->flags & FORCE) == 0){
2579
		tcprcvwin(s);
2580
		if((int)(tcb->rcv.wptr - tcb->rcv.wsnt) >= 2*tcb->mss){
2581
			tpriv->stats[Wopenack]++;
2582
			tcb->flags |= FORCE;
2583
		}
2584
	}
2585
 
2586
	for(msgs = 0; msgs < 100; msgs++) {
2587
		switch(tcb->state) {
2588
		case Listen:
2589
		case Closed:
2590
		case Finwait2:
2591
			return;
2592
		}
2593
 
2594
		/* Don't send anything else until our SYN has been acked */
2595
		if(tcb->snd.ptr != tcb->iss && (tcb->flags & SYNACK) == 0)
2596
			break;
2597
 
2598
		/* force an ack when a window has opened up */
2599
		tcprcvwin(s);
2600
		if(tcb->rcv.blocked && tcb->rcv.wnd > 0){
2601
			tcb->rcv.blocked = 0;
2602
			tcb->flags |= FORCE;
2603
		}
2604
 
2605
		sndcnt = qlen(s->wq)+tcb->flgcnt;
2606
		sent = tcb->snd.ptr - tcb->snd.una;
2607
		ssize = sndcnt;
2608
		if(tcb->snd.wnd == 0){
2609
			/* zero window probe */
2610
			if(sent > 0 && !(tcb->flags & FORCE))
2611
				break;	/* already probing, rto re-probes */
2612
			if(ssize < sent)
2613
				ssize = 0;
2614
			else{
2615
				ssize -= sent;
2616
				if(ssize > 0)
2617
					ssize = 1;
2618
			}
2619
		} else {
2620
			/* calculate usable segment size */
2621
			if(ssize > tcb->cwind)
2622
				ssize = tcb->cwind;
2623
			if(ssize > tcb->snd.wnd)
2624
				ssize = tcb->snd.wnd;
2625
 
2626
			if(ssize < sent)
2627
				ssize = 0;
2628
			else {
2629
				ssize -= sent;
2630
				if(ssize > tcb->mss)
2631
					ssize = tcb->mss;
2632
			}
2633
		}
2634
 
2635
		dsize = ssize;
2636
		seg.urg = 0;
2637
 
2638
		if(!(tcb->flags & FORCE))
2639
			if(ssize == 0 ||
2640
			    ssize < tcb->mss && tcb->snd.nxt == tcb->snd.ptr &&
2641
			    sent > TCPREXMTTHRESH * tcb->mss)
2642
				break;
2643
 
2644
		tcb->flags &= ~FORCE;
2645
 
2646
		/* By default we will generate an ack */
2647
		tcphalt(tpriv, &tcb->acktimer);
2648
		seg.source = s->lport;
2649
		seg.dest = s->rport;
2650
		seg.flags = ACK;
2651
		seg.mss = 0;
2652
		seg.ws = 0;
2653
		seg.update = 0;
2654
		switch(tcb->state){
2655
		case Syn_sent:
2656
			seg.flags = 0;
2657
			if(tcb->snd.ptr == tcb->iss){
2658
				seg.flags |= SYN;
2659
				dsize--;
2660
				seg.mss = tcb->mss;
2661
				seg.ws = tcb->scale;
2662
			}
2663
			break;
2664
		case Syn_received:
2665
			/*
2666
			 *  don't send any data with a SYN/ACK packet
2667
			 *  because Linux rejects the packet in its
2668
			 *  attempt to solve the SYN attack problem
2669
			 */
2670
			if(tcb->snd.ptr == tcb->iss){
2671
				seg.flags |= SYN;
2672
				dsize = 0;
2673
				ssize = 1;
2674
				seg.mss = tcb->mss;
2675
				seg.ws = tcb->scale;
2676
			}
2677
			break;
2678
		}
2679
		seg.seq = tcb->snd.ptr;
2680
		seg.ack = tcb->rcv.nxt;
2681
		seg.wnd = tcb->rcv.wnd;
2682
 
2683
		/* Pull out data to send */
2684
		bp = nil;
2685
		if(dsize != 0) {
2686
			bp = qcopy(s->wq, dsize, sent);
2687
			if(BLEN(bp) != dsize) {
2688
				seg.flags |= FIN;
2689
				dsize--;
2690
			}
2691
		}
2692
 
2693
		if(sent+dsize == sndcnt && dsize)
2694
			seg.flags |= PSH;
2695
 
2696
		tcb->snd.ptr += ssize;
2697
 
2698
		/* Pull up the send pointer so we can accept acks
2699
		 * for this window
2700
		 */
2701
		if(seq_gt(tcb->snd.ptr,tcb->snd.nxt))
2702
			tcb->snd.nxt = tcb->snd.ptr;
2703
 
2704
		/* Build header, link data and compute cksum */
2705
		switch(version){
2706
		case V4:
2707
			tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2708
			hbp = htontcp4(&seg, bp, &tcb->protohdr.tcp4hdr, tcb);
2709
			if(hbp == nil) {
2710
				freeblist(bp);
2711
				return;
2712
			}
2713
			break;
2714
		case V6:
2715
			tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2716
			hbp = htontcp6(&seg, bp, &tcb->protohdr.tcp6hdr, tcb);
2717
			if(hbp == nil) {
2718
				freeblist(bp);
2719
				return;
2720
			}
2721
			break;
2722
		default:
2723
			hbp = nil;	/* to suppress a warning */
2724
			panic("tcpoutput: version %d", version);
2725
		}
2726
 
2727
		/* Start the transmission timers if there is new data and we
2728
		 * expect acknowledges
2729
		 */
2730
		if(ssize != 0){
2731
			if(tcb->timer.state != TcptimerON){
2732
				tcb->time = NOW;
2733
				tcb->timeuna = tcb->snd.una;
2734
				tcpgo(tpriv, &tcb->timer);
2735
			}
2736
 
2737
			/*  If round trip timer isn't running, start it.
2738
			 *  measure the longest packet only in case the
2739
			 *  transmission time dominates RTT
2740
			 */
2741
			if(tcb->snd.retransmit == 0)
2742
			if(tcb->rtt_timer.state != TcptimerON)
2743
			if(ssize == tcb->mss) {
2744
				tcpgo(tpriv, &tcb->rtt_timer);
2745
				tcb->rttseq = tcb->snd.ptr;
2746
			}
2747
		}
2748
 
2749
		tpriv->stats[OutSegs]++;
2750
		if(tcb->snd.retransmit)
2751
			tpriv->stats[RetransSegsSent]++;
2752
		tcb->rcv.ackptr = seg.ack;
2753
		tcb->rcv.wsnt = tcb->rcv.wptr;
2754
 
2755
		/* put off the next keep alive */
2756
		tcpgo(tpriv, &tcb->katimer);
2757
 
2758
		switch(version){
2759
		case V4:
2760
			if(ipoput4(f, hbp, 0, s->ttl, s->tos, s) < 0){
2761
				/* a negative return means no route */
2762
				localclose(s, "no route");
2763
			}
2764
			break;
2765
		case V6:
2766
			if(ipoput6(f, hbp, 0, s->ttl, s->tos, s) < 0){
2767
				/* a negative return means no route */
2768
				localclose(s, "no route");
2769
			}
2770
			break;
2771
		default:
2772
			panic("tcpoutput2: version %d", version);
2773
		}
2774
		if((msgs%4) == 3){
2775
			qunlock(s);
2776
			qlock(s);
2777
		}
2778
	}
2779
}
2780
 
2781
/*
2782
 *  the BSD convention (hack?) for keep alives.  resend last uchar acked.
2783
 */
2784
static void
2785
tcpsendka(Conv *s)
2786
{
2787
	Tcp seg;
2788
	Tcpctl *tcb;
2789
	Block *hbp,*dbp;
2790
 
2791
	tcb = (Tcpctl*)s->ptcl;
2792
 
2793
	dbp = nil;
2794
	memset(&seg, 0, sizeof seg);
2795
	seg.urg = 0;
2796
	seg.source = s->lport;
2797
	seg.dest = s->rport;
2798
	seg.flags = ACK|PSH;
2799
	seg.mss = 0;
2800
	seg.ws = 0;
2801
	if(tcpporthogdefense)
2802
		seg.seq = tcb->snd.una-(1<<30)-nrand(1<<20);
2803
	else
2804
		seg.seq = tcb->snd.una-1;
2805
	seg.ack = tcb->rcv.nxt;
2806
	tcb->rcv.ackptr = seg.ack;
2807
	tcprcvwin(s);
2808
	seg.wnd = tcb->rcv.wnd;
2809
	if(tcb->state == Finwait2){
2810
		seg.flags |= FIN;
2811
	} else {
2812
		dbp = allocb(1);
2813
		dbp->wp++;
2814
	}
2815
 
2816
	if(isv4(s->raddr)) {
2817
		/* Build header, link data and compute cksum */
2818
		tcb->protohdr.tcp4hdr.vihl = IP_VER4;
2819
		hbp = htontcp4(&seg, dbp, &tcb->protohdr.tcp4hdr, tcb);
2820
		if(hbp == nil) {
2821
			freeblist(dbp);
2822
			return;
2823
		}
2824
		ipoput4(s->p->f, hbp, 0, s->ttl, s->tos, s);
2825
	}
2826
	else {
2827
		/* Build header, link data and compute cksum */
2828
		tcb->protohdr.tcp6hdr.vcf[0] = IP_VER6;
2829
		hbp = htontcp6(&seg, dbp, &tcb->protohdr.tcp6hdr, tcb);
2830
		if(hbp == nil) {
2831
			freeblist(dbp);
2832
			return;
2833
		}
2834
		ipoput6(s->p->f, hbp, 0, s->ttl, s->tos, s);
2835
	}
2836
}
2837
 
2838
/*
2839
 *  set connection to time out after 12 minutes
2840
 */
2841
static void
2842
tcpsetkacounter(Tcpctl *tcb)
2843
{
2844
	tcb->kacounter = (12 * 60 * 1000) / (tcb->katimer.start*MSPTICK);
2845
	if(tcb->kacounter < 3)
2846
		tcb->kacounter = 3;
2847
}
2848
 
2849
/*
2850
 *  if we've timed out, close the connection
2851
 *  otherwise, send a keepalive and restart the timer
2852
 */
2853
static void
2854
tcpkeepalive(void *v)
2855
{
2856
	Tcpctl *tcb;
2857
	Conv *s;
2858
 
2859
	s = v;
2860
	tcb = (Tcpctl*)s->ptcl;
2861
	if(waserror()){
2862
		qunlock(s);
2863
		nexterror();
2864
	}
2865
	qlock(s);
2866
	if(tcb->state != Closed){
2867
		if(--(tcb->kacounter) <= 0) {
2868
			localclose(s, Etimedout);
2869
		} else {
2870
			tcpsendka(s);
2871
			tcpgo(s->p->priv, &tcb->katimer);
2872
		}
2873
	}
2874
	qunlock(s);
2875
	poperror();
2876
}
2877
 
2878
/*
2879
 *  start keepalive timer
2880
 */
2881
static char*
2882
tcpstartka(Conv *s, char **f, int n)
2883
{
2884
	Tcpctl *tcb;
2885
	int x;
2886
 
2887
	tcb = (Tcpctl*)s->ptcl;
2888
	if(tcb->state != Established)
2889
		return "connection must be in Establised state";
2890
	if(n > 1){
2891
		x = atoi(f[1]);
2892
		if(x >= MSPTICK)
2893
			tcb->katimer.start = x/MSPTICK;
2894
	}
2895
	tcpsetkacounter(tcb);
2896
	tcpgo(s->p->priv, &tcb->katimer);
2897
 
2898
	return nil;
2899
}
2900
 
2901
/*
2902
 *  turn checksums on/off
2903
 */
2904
static char*
2905
tcpsetchecksum(Conv *s, char **f, int)
2906
{
2907
	Tcpctl *tcb;
2908
 
2909
	tcb = (Tcpctl*)s->ptcl;
2910
	tcb->nochecksum = !atoi(f[1]);
2911
 
2912
	return nil;
2913
}
2914
 
2915
/*
2916
 *  retransmit (at most) one segment at snd.una.
2917
 *  preserve cwind & snd.ptr
2918
 */
2919
static void
2920
tcprxmit(Conv *s)
2921
{
2922
	Tcpctl *tcb;
2923
	Tcppriv *tpriv;
2924
	ulong tcwind, tptr;
2925
 
2926
	tcb = (Tcpctl*)s->ptcl;
2927
	tcb->flags |= RETRAN|FORCE;
2928
 
2929
	tptr = tcb->snd.ptr;
2930
	tcwind = tcb->cwind;
2931
	tcb->snd.ptr = tcb->snd.una;
2932
	tcb->cwind = tcb->mss;
2933
	tcb->snd.retransmit = 1;
2934
	tcpoutput(s);
2935
	tcb->snd.retransmit = 0;
2936
	tcb->cwind = tcwind;
2937
	tcb->snd.ptr = tptr;
2938
 
2939
	tpriv = s->p->priv;
2940
	tpriv->stats[RetransSegs]++;
2941
}
2942
 
2943
/*
2944
 *  TODO: RFC 4138 F-RTO
2945
 */
2946
static void
2947
tcptimeout(void *arg)
2948
{
2949
	Conv *s;
2950
	Tcpctl *tcb;
2951
	int maxback;
2952
	Tcppriv *tpriv;
2953
 
2954
	s = (Conv*)arg;
2955
	tpriv = s->p->priv;
2956
	tcb = (Tcpctl*)s->ptcl;
2957
 
2958
	if(waserror()){
2959
		qunlock(s);
2960
		nexterror();
2961
	}
2962
	qlock(s);
2963
	switch(tcb->state){
2964
	default:
2965
		tcb->backoff++;
2966
		if(tcb->state == Syn_sent)
2967
			maxback = MAXBACKMS/2;
2968
		else
2969
			maxback = MAXBACKMS;
2970
		tcb->backedoff += tcb->timer.start * MSPTICK;
2971
		if(tcb->backedoff >= maxback) {
2972
			localclose(s, Etimedout);
2973
			break;
2974
		}
2975
		netlog(s->p->f, Logtcprxmt, "rxm %d/%d %ldms %lud rto %d %lud %s\n",
2976
			tcb->srtt, tcb->mdev, NOW - tcb->time,
2977
			tcb->snd.una - tcb->timeuna, tcb->snd.rto, tcb->snd.ptr,
2978
			tcpstates[s->state]);
2979
		tcpsettimer(tcb);
2980
		if(tcb->snd.rto == 0)
2981
			tcpcongestion(tcb);
2982
		tcprxmit(s);
2983
		tcb->snd.ptr = tcb->snd.una;
2984
		tcb->cwind = tcb->mss;
2985
		tcb->snd.rto = 1;
2986
		tpriv->stats[RetransTimeouts]++;
2987
 
2988
		if(tcb->snd.recovery){
2989
			tcb->snd.dupacks = 0;		/* reno rto */
2990
			tcb->snd.recovery = 0;
2991
			tpriv->stats[RecoveryRTO]++;
2992
			tcb->snd.rxt = tcb->snd.nxt;
2993
			netlog(s->p->f, Logtcpwin,
2994
				"rto recovery rxt @%lud\n", tcb->snd.nxt);
2995
		}
2996
 
2997
		tcb->abcbytes = 0;
2998
		break;
2999
	case Time_wait:
3000
		localclose(s, nil);
3001
		break;
3002
	case Closed:
3003
		break;
3004
	}
3005
	qunlock(s);
3006
	poperror();
3007
}
3008
 
3009
static int
3010
inwindow(Tcpctl *tcb, int seq)
3011
{
3012
	return seq_within(seq, tcb->rcv.nxt, tcb->rcv.nxt+tcb->rcv.wnd-1);
3013
}
3014
 
3015
/*
3016
 *  set up state for a received SYN (or SYN ACK) packet
3017
 */
3018
static void
3019
procsyn(Conv *s, Tcp *seg)
3020
{
3021
	Tcpctl *tcb;
3022
	Tcppriv *tpriv;
3023
 
3024
	tcb = (Tcpctl*)s->ptcl;
3025
	tcb->flags |= FORCE;
3026
 
3027
	tcb->rcv.nxt = seg->seq + 1;
3028
	tcb->rcv.wptr = tcb->rcv.nxt;
3029
	tcb->rcv.wsnt = 0;
3030
	tcb->rcv.urg = tcb->rcv.nxt;
3031
	tcb->irs = seg->seq;
3032
 
3033
	/* our sending max segment size cannot be bigger than what he asked for */
3034
	if(seg->mss != 0 && seg->mss < tcb->mss) {
3035
		tcb->mss = seg->mss;
3036
		tpriv = s->p->priv;
3037
		tpriv->stats[Mss] = tcb->mss;
3038
	}
3039
 
3040
	tcb->snd.wnd = seg->wnd;
3041
	initialwindow(tcb);
3042
}
3043
 
3044
static int
3045
dumpreseq(Tcpctl *tcb)
3046
{
3047
	Reseq *r, *next;
3048
 
3049
	for(r = tcb->reseq; r != nil; r = next){
3050
		next = r->next;
3051
		freeblist(r->bp);
3052
		free(r);
3053
	}
3054
	tcb->reseq = nil;
3055
	tcb->nreseq = 0;
3056
	tcb->reseqlen = 0;
3057
	return -1;
3058
}
3059
 
3060
static void
3061
logreseq(Fs *f, Reseq *r, ulong n)
3062
{
3063
	char *s;
3064
 
3065
	for(; r != nil; r = r->next){
3066
		s = nil;
3067
		if(r->next == nil && r->seg.seq != n)
3068
			s = "hole/end";
3069
		else if(r->next == nil)
3070
			s = "end";
3071
		else if(r->seg.seq != n)
3072
			s = "hole";
3073
		if(s != nil)
3074
			netlog(f, Logtcp, "%s %lud-%lud (%ld) %#ux\n", s,
3075
				n, r->seg.seq, r->seg.seq - n, r->seg.flags);
3076
		n = r->seg.seq + r->seg.len;
3077
	}
3078
}
3079
 
3080
static int
3081
addreseq(Fs *f, Tcpctl *tcb, Tcppriv *tpriv, Tcp *seg, Block *bp, ushort length)
3082
{
3083
	Reseq *rp, **rr;
3084
	int qmax;
3085
 
3086
	rp = malloc(sizeof *rp);
3087
	if(rp == nil){
3088
		freeblist(bp);		/* bp always consumed by addreseq */
3089
		return 0;
3090
	}
3091
 
3092
	rp->seg = *seg;
3093
	rp->bp = bp;
3094
	rp->length = length;
3095
 
3096
	tcb->reseqlen += length;
3097
	tcb->nreseq++;
3098
 
3099
	/* Place on reassembly list sorting by starting seq number */
3100
	for(rr = &tcb->reseq; ; rr = &(*rr)->next)
3101
		if(*rr == nil || seq_lt(seg->seq, (*rr)->seg.seq)){
3102
			rp->next = *rr;
3103
			*rr = rp;
3104
			tpriv->stats[Resequenced]++;
3105
			if(rp->next != nil)
3106
				tpriv->stats[OutOfOrder]++;
3107
			break;
3108
		}
3109
 
3110
	qmax = tcb->window;
3111
	if(tcb->reseqlen > qmax){
3112
		netlog(f, Logtcp, "tcp: reseq: queue > window: %d > %d; %d packets\n",
3113
			tcb->reseqlen, qmax, tcb->nreseq);
3114
		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3115
		tpriv->stats[ReseqBytelim]++;
3116
		return dumpreseq(tcb);
3117
	}
3118
	qmax = tcb->window / tcb->mss; /* ~190 for qscale=2, 390 for qscale=3 */
3119
	if(tcb->nreseq > qmax){
3120
		netlog(f, Logtcp, "resequence queue > packets: %d %d; %d bytes\n",
3121
			tcb->nreseq, qmax, tcb->reseqlen);
3122
		logreseq(f, tcb->reseq, tcb->rcv.nxt);
3123
		tpriv->stats[ReseqPktlim]++;
3124
		return dumpreseq(tcb);
3125
	}
3126
	return 0;
3127
}
3128
 
3129
static void
3130
getreseq(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3131
{
3132
	Reseq *rp;
3133
 
3134
	rp = tcb->reseq;
3135
	if(rp == nil)
3136
		return;
3137
 
3138
	tcb->reseq = rp->next;
3139
 
3140
	*seg = rp->seg;
3141
	*bp = rp->bp;
3142
	*length = rp->length;
3143
 
3144
	tcb->nreseq--;
3145
	tcb->reseqlen -= rp->length;
3146
 
3147
	free(rp);
3148
}
3149
 
3150
static int
3151
tcptrim(Tcpctl *tcb, Tcp *seg, Block **bp, ushort *length)
3152
{
3153
	ushort len;
3154
	uchar accept;
3155
	int dupcnt, excess;
3156
 
3157
	accept = 0;
3158
	len = *length;
3159
	if(seg->flags & SYN)
3160
		len++;
3161
	if(seg->flags & FIN)
3162
		len++;
3163
 
3164
	if(tcb->rcv.wnd == 0) {
3165
		if(len == 0 && seg->seq == tcb->rcv.nxt)
3166
			return 0;
3167
	}
3168
	else {
3169
		/* Some part of the segment should be in the window */
3170
		if(inwindow(tcb,seg->seq))
3171
			accept++;
3172
		else
3173
		if(len != 0) {
3174
			if(inwindow(tcb, seg->seq+len-1) ||
3175
			seq_within(tcb->rcv.nxt, seg->seq,seg->seq+len-1))
3176
				accept++;
3177
		}
3178
	}
3179
	if(!accept) {
3180
		freeblist(*bp);
3181
		return -1;
3182
	}
3183
	dupcnt = tcb->rcv.nxt - seg->seq;
3184
	if(dupcnt > 0){
3185
		tcb->rerecv += dupcnt;
3186
		if(seg->flags & SYN){
3187
			seg->flags &= ~SYN;
3188
			seg->seq++;
3189
 
3190
			if(seg->urg > 1)
3191
				seg->urg--;
3192
			else
3193
				seg->flags &= ~URG;
3194
			dupcnt--;
3195
		}
3196
		if(dupcnt > 0){
3197
			pullblock(bp, (ushort)dupcnt);
3198
			seg->seq += dupcnt;
3199
			*length -= dupcnt;
3200
 
3201
			if(seg->urg > dupcnt)
3202
				seg->urg -= dupcnt;
3203
			else {
3204
				seg->flags &= ~URG;
3205
				seg->urg = 0;
3206
			}
3207
		}
3208
	}
3209
	excess = seg->seq + *length - (tcb->rcv.nxt + tcb->rcv.wnd);
3210
	if(excess > 0) {
3211
		tcb->rerecv += excess;
3212
		*length -= excess;
3213
		*bp = trimblock(*bp, 0, *length);
3214
		if(*bp == nil)
3215
			panic("presotto is a boofhead");
3216
		seg->flags &= ~FIN;
3217
	}
3218
	return 0;
3219
}
3220
 
3221
static void
3222
tcpadvise(Proto *tcp, Block *bp, char *msg)
3223
{
3224
	Tcp4hdr *h4;
3225
	Tcp6hdr *h6;
3226
	Tcpctl *tcb;
3227
	uchar source[IPaddrlen];
3228
	uchar dest[IPaddrlen];
3229
	ushort psource, pdest;
3230
	Conv *s, **p;
3231
 
3232
	h4 = (Tcp4hdr*)(bp->rp);
3233
	h6 = (Tcp6hdr*)(bp->rp);
3234
 
3235
	if((h4->vihl&0xF0)==IP_VER4) {
3236
		v4tov6(dest, h4->tcpdst);
3237
		v4tov6(source, h4->tcpsrc);
3238
		psource = nhgets(h4->tcpsport);
3239
		pdest = nhgets(h4->tcpdport);
3240
	}
3241
	else {
3242
		ipmove(dest, h6->tcpdst);
3243
		ipmove(source, h6->tcpsrc);
3244
		psource = nhgets(h6->tcpsport);
3245
		pdest = nhgets(h6->tcpdport);
3246
	}
3247
 
3248
	/* Look for a connection */
3249
	qlock(tcp);
3250
	for(p = tcp->conv; *p; p++) {
3251
		s = *p;
3252
		tcb = (Tcpctl*)s->ptcl;
3253
		if(s->rport == pdest)
3254
		if(s->lport == psource)
3255
		if(tcb->state != Closed)
3256
		if(ipcmp(s->raddr, dest) == 0)
3257
		if(ipcmp(s->laddr, source) == 0){
3258
			qlock(s);
3259
			qunlock(tcp);
3260
			switch(tcb->state){
3261
			case Syn_sent:
3262
				localclose(s, msg);
3263
				break;
3264
			}
3265
			qunlock(s);
3266
			freeblist(bp);
3267
			return;
3268
		}
3269
	}
3270
	qunlock(tcp);
3271
	freeblist(bp);
3272
}
3273
 
3274
static char*
3275
tcpporthogdefensectl(char *val)
3276
{
3277
	if(strcmp(val, "on") == 0)
3278
		tcpporthogdefense = 1;
3279
	else if(strcmp(val, "off") == 0)
3280
		tcpporthogdefense = 0;
3281
	else
3282
		return "unknown value for tcpporthogdefense";
3283
	return nil;
3284
}
3285
 
3286
/* called with c qlocked */
3287
static char*
3288
tcpctl(Conv* c, char** f, int n)
3289
{
3290
	if(n == 1 && strcmp(f[0], "hangup") == 0)
3291
		return tcphangup(c);
3292
	if(n >= 1 && strcmp(f[0], "keepalive") == 0)
3293
		return tcpstartka(c, f, n);
3294
	if(n >= 1 && strcmp(f[0], "checksum") == 0)
3295
		return tcpsetchecksum(c, f, n);
3296
	if(n >= 1 && strcmp(f[0], "tcpporthogdefense") == 0)
3297
		return tcpporthogdefensectl(f[1]);
3298
	return "unknown control request";
3299
}
3300
 
3301
static int
3302
tcpstats(Proto *tcp, char *buf, int len)
3303
{
3304
	Tcppriv *priv;
3305
	char *p, *e;
3306
	int i;
3307
 
3308
	priv = tcp->priv;
3309
	p = buf;
3310
	e = p+len;
3311
	for(i = 0; i < Nstats; i++)
3312
		p = seprint(p, e, "%s: %llud\n", statnames[i], priv->stats[i]);
3313
	return p - buf;
3314
}
3315
 
3316
/*
3317
 *  garbage collect any stale conversations:
3318
 *	- SYN received but no SYN-ACK after 5 seconds (could be the SYN attack)
3319
 *	- Finwait2 after 5 minutes
3320
 *
3321
 *  this is called whenever we run out of channels.  Both checks are
3322
 *  of questionable validity so we try to use them only when we're
3323
 *  up against the wall.
3324
 */
3325
static int
3326
tcpgc(Proto *tcp)
3327
{
3328
	Conv *c, **pp, **ep;
3329
	int n;
3330
	Tcpctl *tcb;
3331
 
3332
 
3333
	n = 0;
3334
	ep = &tcp->conv[tcp->nc];
3335
	for(pp = tcp->conv; pp < ep; pp++) {
3336
		c = *pp;
3337
		if(c == nil)
3338
			break;
3339
		if(!canqlock(c))
3340
			continue;
3341
		tcb = (Tcpctl*)c->ptcl;
3342
		switch(tcb->state){
3343
		case Syn_received:
3344
			if(NOW - tcb->time > 5000){
3345
				localclose(c, Etimedout);
3346
				n++;
3347
			}
3348
			break;
3349
		case Finwait2:
3350
			if(NOW - tcb->time > 5*60*1000){
3351
				localclose(c, Etimedout);
3352
				n++;
3353
			}
3354
			break;
3355
		}
3356
		qunlock(c);
3357
	}
3358
	return n;
3359
}
3360
 
3361
static void
3362
tcpsettimer(Tcpctl *tcb)
3363
{
3364
	int x;
3365
 
3366
	/* round trip dependency */
3367
	x = backoff(tcb->backoff) *
3368
		(tcb->mdev + (tcb->srtt>>LOGAGAIN) + MSPTICK) / MSPTICK;
3369
 
3370
	/* bounded twixt 0.3 and 64 seconds */
3371
	if(x < 300/MSPTICK)
3372
		x = 300/MSPTICK;
3373
	else if(x > (64000/MSPTICK))
3374
		x = 64000/MSPTICK;
3375
	tcb->timer.start = x;
3376
}
3377
 
3378
void
3379
tcpinit(Fs *fs)
3380
{
3381
	Proto *tcp;
3382
	Tcppriv *tpriv;
3383
 
3384
	tcp = smalloc(sizeof(Proto));
3385
	tpriv = tcp->priv = smalloc(sizeof(Tcppriv));
3386
	tcp->name = "tcp";
3387
	tcp->connect = tcpconnect;
3388
	tcp->announce = tcpannounce;
3389
	tcp->ctl = tcpctl;
3390
	tcp->state = tcpstate;
3391
	tcp->create = tcpcreate;
3392
	tcp->close = tcpclose;
3393
	tcp->rcv = tcpiput;
3394
	tcp->advise = tcpadvise;
3395
	tcp->stats = tcpstats;
3396
	tcp->inuse = tcpinuse;
3397
	tcp->gc = tcpgc;
3398
	tcp->ipproto = IP_TCPPROTO;
3399
	tcp->nc = scalednconv();
3400
	tcp->ptclsize = sizeof(Tcpctl);
3401
	tpriv->stats[MaxConn] = tcp->nc;
3402
 
3403
	Fsproto(fs, tcp);
3404
}
3405
 
3406
static void
3407
tcpsetscale(Conv *s, Tcpctl *tcb, ushort rcvscale, ushort sndscale)
3408
{
3409
	/*
3410
	 * guess at reasonable queue sizes.  there's no current way
3411
	 * to know how many nic receive buffers we can safely tie up in the
3412
	 * tcp stack, and we don't adjust our queues to maximize throughput
3413
	 * and minimize bufferbloat.  n.b. the offer (rcvscale) needs to be
3414
	 * respected, but we still control our own buffer commitment by
3415
	 * keeping a seperate qscale.
3416
	 */
3417
	tcb->rcv.scale = rcvscale & 0xff;
3418
	tcb->snd.scale = sndscale & 0xff;
3419
	tcb->qscale = rcvscale & 0xff;
3420
	if(rcvscale > Maxqscale)
3421
		tcb->qscale = Maxqscale;
3422
 
3423
	if(rcvscale != tcb->rcv.scale)
3424
		netlog(s->p->f, Logtcp, "tcpsetscale: window %lud "
3425
			"qlen %d >> window %ud lport %d\n",
3426
			tcb->window, qlen(s->rq), QMAX<<tcb->qscale, s->lport);
3427
	tcb->window = QMAX << tcb->qscale;
3428
	tcb->ssthresh = tcb->window;
3429
 
3430
	/*
3431
	 * it's important to set wq large enough to cover the full
3432
	 * bandwidth-delay product.  it's possible to be in loss
3433
	 * recovery with a big window, and we need to keep sending
3434
	 * into the inflated window.  the difference can be huge
3435
	 * for even modest (70ms) ping times.
3436
	 */
3437
	qsetlimit(s->rq, tcb->window);
3438
	qsetlimit(s->wq, tcb->window);
3439
	tcprcvwin(s);
3440
}