Subversion Repositories planix.SVN

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <u.h>
2
#include <libc.h>
3
#include <bio.h>
4
#include <ctype.h>
5
#include "code.h"
6
 
7
/*
 * Character-class helpers.  char may be signed, so each argument is
 * masked to 0..255 before it reaches <ctype.h> or indexes voweltab.
 */
#define ISUPPER(c)	isupper((c)&0xff)
#define ISLOWER(c)	islower((c)&0xff)
#define	ISALPHA(c)	isalpha((c)&0xff)
#define	ISDIGIT(c)	isdigit((c)&0xff)
#define ISVOWEL(c)	voweltab[(c)&0xff]
/* fold an upper-case letter to lower case; c is evaluated more than once */
#define Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))
/* pack two characters into one integer, a in the higher byte */
#define pair(a,b)	(((a)<<8) | (b))
#define DLEV		2	/* trysuff adds this to lev on every call */
#define DSIZ		40	/* derivation levels below this are recorded in deriv[] */

typedef	long	Bits;	/* bit set of word-class and affix flags */
/* the bits of f that are present in h (non-zero if any) */
#define	Set(h, f)	((long)(h) & (f))

/*
 * Suffix-action routines stored in Suftab.p1/p2.  trysuff calls them as
 * (*p)(ep-n, d, a, lev, flag): ep-n points just past the candidate stem
 * inside word[], d and a are the table's derivation messages.
 */
Bits 	nop(char*, char*, char*, int, int);
Bits 	strip(char*, char*, char*, int, int);
Bits 	ize(char*, char*, char*, int, int);
Bits 	i_to_y(char*, char*, char*, int, int);
Bits 	ily(char*, char*, char*, int, int);
Bits 	subst(char*, char*, char*, int, int);
Bits 	CCe(char*, char*, char*, int, int);
Bits 	tion(char*, char*, char*, int, int);
Bits 	an(char*, char*, char*, int, int);
Bits 	s(char*, char*, char*, int, int);
Bits 	es(char*, char*, char*, int, int);
Bits 	bility(char*, char*, char*, int, int);
Bits 	y_to_e(char*, char*, char*, int, int);
Bits 	VCe(char*, char*, char*, int, int);

/* dictionary lookup drivers and derivation printing */
Bits 	trypref(char*, char*, int, int);
Bits	tryword(char*, char*, int, int);
Bits 	trysuff(char*, int, int);
Bits	dict(char*, char*);
void	typeprint(Bits);
void	pcomma(char*);

/* miscellaneous helpers */
void	ise(void);
int	ordinal(void);
char*	skipv(char*);
int	inun(char*, Bits);
char*	ztos(char*);
void	readdict(char*);

/*
 * One prefix-table entry.  lookuppref walks a table until it meets an
 * entry whose s is 0.
 */
typedef	struct	Ptab	Ptab;
struct	Ptab
{
	char*	s;	/* prefix text */
	int	flag;	/* trypref tests the IN bit; 0 otherwise */
};

typedef	struct	Suftab	Suftab;
58
struct	Suftab
59
{
60
	char	*suf;
61
	Bits	(*p1)(char*, char*, char*, int, int);
62
	int	n1;
63
	char	*d1;
64
	char	*a1;
65
	int	flag;
66
	int	affixable;
67
	Bits	(*p2)(char*, char*, char*, int, int);
68
	int	n2;
69
	char	*d2;
70
	char	*a2;
71
};
72
 
73
Suftab	staba[] = {
74
	{"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
75
 
76
};
77
 
78
Suftab	stabc[] =
79
{
80
	{"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
81
	{"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
82
	{"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
83
	{"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
84
	{"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
85
	{"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
86
	{"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
87
	{"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
88
	{"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
89
 
90
};
91
Suftab	stabd[] =
92
{
93
	{"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
94
	{"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
95
 
96
};
97
Suftab	stabe[] =
98
{
99
	/*
100
	 * V_affix for comment ->commence->commentment??
101
	 */
102
	{"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
103
	{"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
104
	{"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
105
	{"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
106
	{"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
107
	{"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
108
	{"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
109
 
110
};
111
Suftab	stabg[] =
112
{
113
	{"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
114
	{"gnikam",strip,6,"","+making",NOUN,NOUN},
115
	{"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
116
	{"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
117
 
118
};
119
Suftab	stabl[] =
120
{
121
	{"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
122
	{"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
123
	{"latnem",strip,2,"","+al",N_AFFIX,ADJ},
124
	{"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
125
	{"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
126
 
127
};
128
Suftab	stabm[] =
129
{
130
		/* congregational + ism */
131
	{"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
132
	{"margo",subst,-1,"-ph+m","",NOUN,NOUN},
133
 
134
};
135
Suftab	stabn[] =
136
{
137
	{"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
138
	{"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
139
	{"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
140
	{"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
141
	{"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
142
	{"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
143
	{"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
144
	{"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
145
	{"nem",strip,3,"","+man",MAN,PROP_COLLECT},
146
	{"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
147
 
148
};
149
Suftab	stabp[] =
150
{
151
	{"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
152
 
153
};
154
Suftab	stabr[] =
155
{
156
	{"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
157
	{"reyhparg",nop,0,"","",0,NOUN},
158
	{"reyl",nop,0,"","",0,NOUN},
159
	{"rekam",strip,5,"","+maker",NOUN,NOUN},
160
	{"repeek",strip,6,"","+keeper",NOUN,NOUN},
161
	{"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ,	i_to_y,2,"-y+ier","+er"},
162
	{"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
163
	{"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
164
	{"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
165
 
166
};
167
Suftab	stabs[] =
168
{
169
	{"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
170
	{"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
171
	{"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH ,	es,2,"-y+ies","+es"},
172
	{"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
173
	{"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH  },
174
 
175
};
176
Suftab	stabt[] =
177
{
178
	{"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
179
	{"tse",strip,2,"","+st",EST,DONT_TOUCH,	i_to_y,3,"-y+iest","+est" },
180
	{"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
181
	{"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
182
 
183
};
184
Suftab	staby[] =
185
{
186
	{"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
187
	{"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
188
	{"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
189
	{"ytisuo",nop,0,"","",NOUN},
190
	{"ytilb",nop,0,"","",0,NOUN},
191
	{"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
192
	{"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
193
	{"ylc",nop,0,"","",0},
194
	{"ylelb",nop,0,"","",0},
195
	{"ylelp",nop,0,"","",0},
196
	{"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
197
	{"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
198
	{"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
199
 
200
};
201
Suftab	stabz[] =
202
{
203
 
204
};
205
Suftab*	suftab[] =
206
{
207
	staba,
208
	stabz,
209
	stabc,
210
	stabd,
211
	stabe,
212
	stabz,
213
	stabg,
214
	stabz,
215
	stabz,
216
	stabz,
217
	stabz,
218
	stabl,
219
	stabm,
220
	stabn,
221
	stabz,
222
	stabp,
223
	stabz,
224
	stabr,
225
	stabs,
226
	stabt,
227
	stabz,
228
	stabz,
229
	stabz,
230
	stabz,
231
	staby,
232
	stabz,
233
};
234
 
235
Ptab	ptaba[] =
236
{
237
	"anti", 0,
238
	"auto", 0,
239
 
240
};
241
Ptab	ptabb[] =
242
{
243
	"bio", 0,
244
 
245
};
246
Ptab	ptabc[] =
247
{
248
	"counter", 0,
249
 
250
};
251
Ptab	ptabd[] =
252
{
253
	"dis", 0,
254
 
255
};
256
Ptab	ptabe[] =
257
{
258
	"electro", 0,
259
 
260
};
261
Ptab	ptabf[] =
262
{
263
	"femto", 0,
264
 
265
};
266
Ptab	ptabg[] =
267
{
268
	"geo", 0,
269
	"giga", 0,
270
 
271
};
272
Ptab	ptabh[] =
273
{
274
	"hyper", 0,
275
 
276
};
277
Ptab	ptabi[] =
278
{
279
	"immuno", 0,
280
	"im", IN,
281
	"intra", 0,
282
	"inter", 0,
283
	"in", IN,
284
	"ir", IN,
285
	"iso", 0,
286
 
287
};
288
Ptab	ptabj[] =
289
{
290
 
291
};
292
Ptab	ptabk[] =
293
{
294
	"kilo", 0,
295
 
296
};
297
Ptab	ptabl[] =
298
{
299
 
300
};
301
Ptab	ptabm[] =
302
{
303
	"magneto", 0,
304
	"mega", 0,
305
	"meta", 0,
306
	"micro", 0,
307
	"mid", 0,
308
	"milli", 0,
309
	"mini", 0,
310
	"mis", 0,
311
	"mono", 0,
312
	"multi", 0,
313
 
314
};
315
Ptab	ptabn[] =
316
{
317
	"nano", 0,
318
	"neuro", 0,
319
	"non", 0,
320
 
321
};
322
Ptab	ptabo[] =
323
{
324
	"out", 0,
325
	"over", 0,
326
 
327
};
328
Ptab	ptabp[] =
329
{
330
	"para", 0,
331
	"photo", 0,
332
	"pico", 0,
333
	"poly", 0,
334
	"pre", 0,
335
	"pseudo", 0,
336
	"psycho", 0,
337
 
338
};
339
Ptab	ptabq[] =
340
{
341
	"quasi", 0,
342
 
343
};
344
Ptab	ptabr[] =
345
{
346
	"radio", 0,
347
	"re", 0,
348
 
349
};
350
Ptab	ptabs[] =
351
{
352
	"semi", 0,
353
	"stereo", 0,
354
	"sub", 0,
355
	"super", 0,
356
 
357
};
358
Ptab	ptabt[] =
359
{
360
	"tele", 0,
361
	"tera", 0,
362
	"thermo", 0,
363
 
364
};
365
Ptab	ptabu[] =
366
{
367
	"ultra", 0,
368
	"under", 0,	/*must precede un*/
369
	"un", IN,
370
 
371
};
372
Ptab	ptabv[] =
373
{
374
 
375
};
376
Ptab	ptabw[] =
377
{
378
 
379
};
380
Ptab	ptabx[] =
381
{
382
 
383
};
384
Ptab	ptaby[] =
385
{
386
 
387
};
388
Ptab	ptabz[] =
389
{
390
 
391
};
392
 
393
Ptab*	preftab[] =
394
{
395
	ptaba,
396
	ptabb,
397
	ptabc,
398
	ptabd,
399
	ptabe,
400
	ptabf,
401
	ptabg,
402
	ptabh,
403
	ptabi,
404
	ptabj,
405
	ptabk,
406
	ptabl,
407
	ptabm,
408
	ptabn,
409
	ptabo,
410
	ptabp,
411
	ptabq,
412
	ptabr,
413
	ptabs,
414
	ptabt,
415
	ptabu,
416
	ptabv,
417
	ptabw,
418
	ptabx,
419
	ptaby,
420
	ptabz,
421
};
422
 
423
/*
 * One recorded derivation step.  type is NONE or SUFF for a suffix
 * level; trypref adds PREF once per stripped prefix, so tryword treats
 * any other non-zero value as a prefix count times PREF.
 */
typedef struct {
	char *mesg;	/* text appended to affix[] when derivations are printed */
	enum { NONE, SUFF, PREF} type;
} Deriv;

int	aflag;
429
int	cflag;
430
int	fflag;
431
int	vflag;
432
int	xflag;
433
int 	nflag;
434
char	word[500];
435
char*	original;
436
Deriv	emptyderiv;
437
Deriv	deriv[DSIZ+3];
438
char	affix[DSIZ*10];	/* 10 is longest affix message */
439
int	prefcount;
440
int 	suffcount;
441
char*	acmeid;
442
char	space[300000];	/* must be as large as "words"+"space" in pcode run */
443
Bits	encode[2048];	/* must be as long as "codes" in pcode run */
444
int	nencode;
445
char	voweltab[256];
446
char*	spacep[128*128+1];	/* pointer to words starting with 'xx' */
447
Biobuf	bin;
448
Biobuf	bout;
449
 
450
char*	codefile = "/sys/lib/amspell";
451
char*	brfile = "/sys/lib/brspell";
452
char*	Usage = "usage";
453
 
454
void
455
main(int argc, char *argv[])
456
{
457
	char *ep, *cp;
458
	char *dp;
459
	int j, i, c;
460
	int low;
461
	Bits h;
462
 
463
	Binit(&bin, 0, OREAD);
464
	Binit(&bout, 1, OWRITE);
465
	for(i=0; c = "aeiouyAEIOUY"[i]; i++)
466
		voweltab[c] = 1;
467
	while(argc > 1) {
468
		if(argv[1][0] != '-')
469
			break;
470
		for(i=1; c = argv[1][i]; i++)
471
		switch(c) {
472
		default:
473
			fprint(2, "usage: spell [-bcCvx] [-f file]\n");
474
			exits(Usage);
475
 
476
		case 'a':
477
			aflag++;
478
			continue;
479
 
480
		case 'b':
481
			ise();
482
			if(!fflag)
483
				codefile = brfile;
484
			continue;
485
 
486
		case 'C':		/* for "correct" */
487
			vflag++;
488
		case 'c':		/* for ocr */
489
			cflag++;
490
			continue;
491
 
492
		case 'v':
493
			vflag++;
494
			continue;
495
 
496
		case 'x':
497
			xflag++;
498
			continue;
499
 
500
		case 'f':
501
			if(argc <= 2) {
502
				fprint(2, "spell: -f requires another argument\n");
503
				exits(Usage);
504
			}
505
			argv++;
506
			argc--;
507
			codefile = argv[1];
508
			fflag++;
509
			goto brk;
510
		}
511
	brk:
512
		argv++;
513
		argc--;
514
	}
515
	readdict(codefile);
516
	if(argc > 1) {
517
		fprint(2, "usage: spell [-bcCvx] [-f file]\n");
518
		exits(Usage);
519
	}
520
	if(aflag)
521
		cflag = vflag = 0;
522
 
523
	for(;;) {
524
		affix[0] = 0;
525
		original = Brdline(&bin, '\n');
526
		if(original == 0)
527
			exits(0);
528
		original[Blinelen(&bin)-1] = 0;
529
		low = 0;
530
 
531
		if(aflag) {
532
			acmeid = original;
533
			while(*original != ':')
534
				if(*original++ == 0)
535
					exits(0);
536
			while(*++original != ':')
537
				if(*original == 0)
538
					exits(0);
539
			*original++ = 0;
540
		}
541
		for(ep=word,dp=original; j = *dp; ep++,dp++) {
542
			if(ISLOWER(j))
543
				low++;
544
			if(ep >= word+sizeof(word)-1)
545
				break;
546
			*ep = j;
547
		}
548
		*ep = 0;
549
 
550
		if(ISDIGIT(word[0]) && ordinal())
551
			continue;
552
 
553
		h = 0;
554
		if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
555
			for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
556
				*dp = Tolower(*cp);
557
		if(!h)
558
		for(;;) {	/* at most twice */
559
			if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
560
				break;
561
			if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
562
				break;
563
			if(!ISUPPER(word[0]))
564
				break;
565
			cp = original;
566
			dp = word;
567
			while(*dp = *cp++) {
568
					if(!low)
569
						*dp = Tolower(*dp);
570
				dp++;
571
			}
572
			word[0] = Tolower(word[0]);
573
		}
574
 
575
		if(cflag) {
576
			if(!h || Set(h,STOP))
577
				print("-");
578
			else if(!vflag)
579
				print("+");
580
			else 
581
				print("%c",'0' + (suffcount>0) +
582
				   (prefcount>4? 8: 2*prefcount));
583
		} else if(!h || Set(h,STOP)) {
584
			if(aflag)
585
				Bprint(&bout, "%s:%s\n", acmeid, original);
586
			else
587
				Bprint(&bout, "%s\n", original);
588
		} else if(affix[0] != 0 && affix[0] != '.')
589
			print("%s\t%s\n", affix, original);
590
	}
591
	/* not reached */
592
}
593
 
594
/*	strip exactly one suffix and do
595
 *	indicated routine(s), which may recursively
596
 *	strip suffixes
597
 */
598
Bits
599
trysuff(char* ep, int lev, int flag)
600
{
601
	Suftab *t;
602
	char *cp, *sp;
603
	Bits h = 0;
604
	int initchar = ep[-1];
605
 
606
	flag &= ~MONO;
607
	lev += DLEV;
608
	if(lev < DSIZ) {
609
		deriv[lev]  = emptyderiv;
610
		deriv[lev-1] = emptyderiv;
611
	}
612
	if(!ISLOWER(initchar))
613
		return h;
614
	for(t=suftab[initchar-'a']; sp=t->suf; t++) {
615
		cp = ep;
616
		while(*sp)
617
			if(*--cp != *sp++)
618
				goto next;
619
		for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
620
			;
621
		if(sp < word)
622
			continue;
623
		if(!(t->affixable & flag))
624
			return 0;
625
		h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
626
		if(!h && t->p2!=0) {
627
			if(lev < DSIZ) {
628
				deriv[lev] = emptyderiv;
629
				deriv[lev+1] = emptyderiv;
630
			}
631
			h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
632
		}
633
		break;
634
	next:;
635
	}
636
	return h;
637
}
638
 
639
Bits
640
nop(char* ep, char* d, char* a, int lev, int flag)
641
{
642
	USED(ep, d, a, lev, flag);
643
	return 0;
644
}
645
 
646
Bits
647
cstrip(char* ep, char* d, char* a, int lev, int flag)
648
{
649
	int temp = ep[0];
650
 
651
	if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
652
		switch(pair(ep[-1],ep[0])) {
653
		case pair('a', 'a'):
654
		case pair('a', 'e'):
655
		case pair('a', 'i'):
656
		case pair('e', 'a'):
657
		case pair('e', 'e'):
658
		case pair('e', 'i'):
659
		case pair('i', 'i'):
660
		case pair('o', 'a'):
661
			return 0;
662
		}
663
	} else
664
	if(temp==ep[-1]&&temp==ep[-2])
665
		return 0;
666
	return strip(ep,d,a,lev,flag);
667
}
668
 
669
Bits
670
strip(char* ep, char* d, char* a, int lev, int flag)
671
{
672
	Bits h = trypref(ep, a, lev, flag);
673
 
674
	USED(d);
675
	if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
676
		h = 0;
677
	if(h)
678
		return h;
679
	if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
680
		h = trypref(ep-1,a,lev,flag|MONO);
681
		if(h)
682
			return h;
683
	}
684
	return trysuff(ep,lev,flag);
685
}
686
 
687
Bits
688
s(char* ep, char* d, char* a, int lev, int flag)
689
{
690
	if(lev > DLEV+1)
691
		return 0;
692
	if(*ep=='s') {
693
		switch(ep[-1]) {
694
		case 'y':
695
			if(ISVOWEL(ep[-2])||ISUPPER(*word))
696
				break;	/*says Kennedys*/
697
		case 'x':
698
		case 'z':
699
		case 's':
700
			return 0;
701
		case 'h':
702
			switch(ep[-2]) {
703
			case 'c':
704
			case 's':
705
				return 0;
706
			}
707
		}
708
	}
709
	return strip(ep,d,a,lev,flag);
710
}
711
 
712
Bits
713
an(char* ep, char* d, char* a, int lev, int flag)
714
{
715
	USED(d);
716
	if(!ISUPPER(*word))	/*must be proper name*/
717
		return 0;
718
	return trypref(ep,a,lev,flag);
719
}
720
 
721
Bits
722
ize(char* ep, char* d, char* a, int lev, int flag)
723
{
724
	int temp = ep[-1];
725
	Bits h;
726
 
727
	USED(a);
728
	ep[-1] = 'e';
729
	h = strip(ep,"",d,lev,flag);
730
	ep[-1] = temp;
731
	return h;
732
}
733
 
734
Bits
735
y_to_e(char* ep, char* d, char* a, int lev, int flag)
736
{
737
	Bits h;
738
	int  temp;
739
 
740
	USED(a);
741
	switch(ep[-1]) {
742
	case 'a':
743
	case 'e':
744
	case 'i':
745
		return 0;
746
	}
747
	temp = *ep;
748
	*ep++ = 'e';
749
	h = strip(ep,"",d,lev,flag);
750
	ep[-1] = temp;
751
	return h;
752
}
753
 
754
Bits
755
ily(char* ep, char* d, char* a, int lev, int flag)
756
{
757
	int temp = ep[0];
758
	char *cp = ep;
759
 
760
	if(temp==ep[-1]&&temp==ep[-2])		/* sillly */
761
		return 0;
762
	if(*--cp=='y' && !ISVOWEL(*--cp))	/* happyly */
763
		while(cp>word)
764
			if(ISVOWEL(*--cp))	/* shyness */
765
				return 0;
766
	if(ep[-1]=='i')
767
		return i_to_y(ep,d,a,lev,flag);
768
	return cstrip(ep,d,a,lev,flag);
769
}
770
 
771
Bits
772
bility(char* ep, char* d, char* a, int lev, int flag)
773
{
774
	*ep++ = 'l';
775
	return y_to_e(ep,d,a,lev,flag);
776
}
777
 
778
Bits
779
i_to_y(char* ep, char* d, char* a, int lev, int flag)
780
{
781
	Bits h;
782
	int temp;
783
 
784
	if(ISUPPER(*word))
785
		return 0;
786
	if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
787
		ep[-1] = 'y';
788
		a = d;
789
	}
790
	h = cstrip(ep,"",a,lev,flag);
791
	ep[-1] = temp;
792
	return h;
793
}
794
 
795
Bits
796
es(char* ep, char* d, char* a, int lev, int flag)
797
{
798
	if(lev>DLEV)
799
		return 0;
800
	switch(ep[-1]) {
801
	default:
802
		return 0;
803
	case 'i':
804
		return i_to_y(ep,d,a,lev,flag);
805
	case 'h':
806
		switch(ep[-2]) {
807
		default:
808
			return 0;
809
		case 'c':
810
		case 's':
811
			break;
812
		}
813
	case 's':
814
	case 'z':
815
	case 'x':
816
		return strip(ep,d,a,lev,flag);
817
	}
818
}
819
 
820
Bits
821
subst(char* ep, char* d, char* a, int lev, int flag)
822
{
823
	char *u,*t;
824
	Bits h;
825
 
826
	USED(a);
827
	if(skipv(skipv(ep-1)) < word)
828
		return 0;
829
	for(t=d; *t!='+'; t++)
830
		continue;
831
	for(u=ep; *--t!='-';)
832
		*--u = *t;
833
	h = strip(ep,"",d,lev,flag);
834
	while(*++t != '+')
835
		continue;
836
	while(*++t)
837
		*u++ = *t;
838
	return h;
839
}
840
 
841
Bits
842
tion(char* ep, char* d, char* a, int lev, int flag)
843
{
844
	switch(ep[-2]) {
845
	default:
846
		return trypref(ep,a,lev,flag);
847
	case 'a':
848
	case 'e':
849
	case 'i':
850
	case 'o':
851
	case 'u':
852
		return y_to_e(ep,d,a,lev,flag);
853
	}
854
}
855
 
856
/*
857
 * possible consonant-consonant-e ending
858
 */
859
Bits
860
CCe(char* ep, char* d, char* a, int lev, int flag)
861
{
862
	Bits h;
863
 
864
	switch(ep[-1]) {
865
	case 'l':
866
		if(ISVOWEL(ep[-2]))
867
			break;
868
		switch(ep[-2]) {
869
		case 'l':
870
		case 'r':
871
		case 'w':
872
			break;
873
		default:
874
			return y_to_e(ep,d,a,lev,flag);
875
		}
876
		break;
877
	case 'c':
878
	case 'g':
879
		if(*ep == 'a')	/* prevent -able for -eable */
880
			return 0;
881
	case 's':
882
	case 'v':
883
	case 'z':
884
		if(ep[-2]==ep[-1])
885
			break;
886
		if(ISVOWEL(ep[-2]))
887
			break;
888
	case 'u':
889
		if(h = y_to_e(ep,d,a,lev,flag))
890
			return h;
891
		if(!(ep[-2]=='n' && ep[-1]=='g'))
892
			return 0;
893
	}
894
	return VCe(ep,d,a,lev,flag);
895
}
896
 
897
/*
898
 * possible consonant-vowel-consonant-e ending
899
 */
900
Bits
901
VCe(char* ep, char* d, char* a, int lev, int flag)
902
{
903
	int c;
904
	Bits h;
905
 
906
	c = ep[-1];
907
	if(c=='e')
908
		return 0;
909
	if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
910
		c = *ep;
911
		*ep++ = 'e';
912
		h = trypref(ep,d,lev,flag);
913
		if(!h)
914
			h = trysuff(ep,lev,flag);
915
		if(h)
916
			return h;
917
		ep--;
918
		*ep = c;
919
	}
920
	return cstrip(ep,d,a,lev,flag);
921
}
922
 
923
Ptab*
924
lookuppref(uchar** wp, char* ep)
925
{
926
	Ptab *sp;
927
	uchar *bp,*cp;
928
	unsigned int initchar = Tolower(**wp);
929
 
930
	if(!ISALPHA(initchar))
931
		return 0;
932
	for(sp=preftab[initchar-'a'];sp->s;sp++) {
933
		bp = *wp;
934
		for(cp= (uchar*)sp->s;*cp; )
935
			if(*bp++!=*cp++)
936
				goto next;
937
		for(cp=bp;cp<(uchar*)ep;cp++)
938
			if(ISVOWEL(*cp)) {
939
				*wp = bp;
940
				return sp;
941
			}
942
	next:;
943
	}
944
	return 0;
945
}
946
 
947
/*	while word is not in dictionary try stripping
948
 *	prefixes. Fail if no more prefixes.
949
 */
950
Bits
951
trypref(char* ep, char* a, int lev, int flag)
952
{
953
	Ptab *tp;
954
	char *bp, *cp;
955
	char *pp;
956
	Bits h;
957
	char space[20];
958
 
959
	if(lev<DSIZ) {
960
		deriv[lev].mesg = a;
961
		deriv[lev].type = *a=='.'? NONE: SUFF;
962
	}
963
	if(h = tryword(word,ep,lev,flag)) {
964
		if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
965
			return h;
966
		h = 0;
967
	}
968
	bp = word;
969
	pp = space;
970
	if(lev<DSIZ) {
971
		deriv[lev+1].mesg = pp;
972
		deriv[lev+1].type = 0;
973
	}
974
	while(tp=lookuppref((uchar**)&bp,ep)) {
975
		*pp++ = '+';
976
		cp = tp->s;
977
		while(pp<space+sizeof(space) && (*pp = *cp++))
978
			pp++;
979
		deriv[lev+1].type += PREF;
980
		h = tryword(bp,ep,lev+1,flag);
981
		if(Set(h,NOPREF) ||
982
		   ((tp->flag&IN) && inun(bp-2,h)==0)) {
983
			h = 0;
984
			break;
985
		}
986
		if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
987
			break;
988
		h = 0;
989
	}
990
	if(lev < DSIZ) {
991
		deriv[lev+1] = emptyderiv;
992
		deriv[lev+2] = emptyderiv;
993
	}
994
	return h;
995
}
996
 
997
Bits
998
tryword(char* bp, char* ep, int lev, int flag)
999
{
1000
	int  j;
1001
	Bits h = 0;
1002
	char duple[3];
1003
 
1004
	if(ep-bp <= 1)
1005
		return h;
1006
	if(flag&MONO) {
1007
		if(lev<DSIZ) {
1008
			deriv[++lev].mesg = duple;
1009
			deriv[lev].type = SUFF;
1010
		}
1011
		duple[0] = '+';
1012
		duple[1] = *ep;
1013
		duple[2] = 0;
1014
	}
1015
	h = dict(bp, ep);
1016
	if(vflag==0 || h==0)
1017
		return h;
1018
	/*
1019
	 * when derivations are wanted, collect them
1020
	 * for printing
1021
	 */
1022
	j = lev;
1023
	prefcount = suffcount = 0;
1024
	do {
1025
		if(j<DSIZ && deriv[j].type) {
1026
			strcat(affix, deriv[j].mesg);
1027
			if(deriv[j].type == SUFF)
1028
				suffcount++;
1029
			else if(deriv[j].type != NONE)
1030
				prefcount = deriv[j].type/PREF;
1031
		}
1032
	} while(--j > 0);
1033
	return h;
1034
}
1035
 
1036
int
1037
inun(char* bp, Bits h)
1038
{
1039
	if(*bp == 'u')
1040
		return Set(h, IN) == 0;
1041
	/* *bp == 'i' */
1042
	if(Set(h, IN) == 0)
1043
		return 0;
1044
	switch(bp[2]) {
1045
	case 'r':
1046
		return bp[1] == 'r';
1047
	case 'm':
1048
	case 'p':
1049
		return bp[1] == 'm';
1050
	}
1051
	return bp[1] == 'n';
1052
}
1053
 
1054
char*
1055
skipv(char *s)
1056
{
1057
	if(s >= word && ISVOWEL(*s))
1058
		s--;
1059
	while(s >= word && !ISVOWEL(*s))
1060
		s--;
1061
	return s;
1062
}
1063
 
1064
/*
1065
 * crummy way to Britishise
1066
 */
1067
void
1068
ise(void)
1069
{
1070
	Suftab *p;
1071
	int i;
1072
 
1073
	for(i=0; i<26; i++)
1074
		for(p = suftab[i]; p->suf; p++) {
1075
			p->suf = ztos(p->suf);
1076
			p->d1 = ztos(p->d1);
1077
			p->a1 = ztos(p->a1);
1078
		}
1079
}
1080
 
1081
/*
 * Return as with every 'z' replaced by 's'.  A string without any 'z'
 * is returned as is; otherwise the result is a freshly allocated copy
 * that the caller owns and the original is left untouched.
 * Aborts if the copy cannot be allocated.
 */
char*
ztos(char *as)
{
	char *s, *ds;
	long n;

	for(s=as; *s; s++)
		if(*s == 'z')
			break;
	if(*s == 0)
		return as;

	n = strlen(as) + 1;
	ds = malloc(n);
	if(ds == 0)
		abort();	/* out of memory; nothing sensible to return */
	memmove(ds, as, n);
	for(s=ds; *s; s++)
		if(*s == 'z')
			*s = 's';
	return ds;
}

Bits
1100
dict(char* bp, char* ep)
1101
{
1102
	char *cp, *cp1, *w, *wp, *we;
1103
	int n, f;
1104
 
1105
	w = bp;
1106
	we = ep;
1107
	n = ep-bp;
1108
	if(n <= 1)
1109
		return NOUN;
1110
 
1111
	f = w[0] & 0x7f;
1112
	f *= 128;
1113
	f += w[1] & 0x7f;
1114
	bp = spacep[f];
1115
	ep = spacep[f+1];
1116
 
1117
loop:
1118
	if(bp >= ep) {
1119
		if(xflag) 
1120
			fprint(2, "=%.*s\n", utfnlen(w, n), w);
1121
		return 0;
1122
	}
1123
	/*
1124
	 * find the beginning of some word in the middle
1125
	 */
1126
	cp = bp + (ep-bp)/2;
1127
 
1128
	while(cp > bp && !(*cp & 0x80))
1129
		cp--;
1130
	while(cp > bp && (cp[-1] & 0x80))
1131
		cp--;
1132
 
1133
	wp = w + 2;	/* skip two letters */
1134
	cp1 = cp + 2;	/* skip affix code */
1135
	for(;;) {
1136
		if(wp >= we) {
1137
			if(*cp1 & 0x80)
1138
				goto found;
1139
			else
1140
				f = 1;
1141
			break;
1142
		}
1143
		if(*cp1 & 0x80) {
1144
			f = -1;
1145
			break;
1146
		}
1147
		f = *cp1++ - *wp++;
1148
		if(f != 0)
1149
			break;
1150
	}
1151
 
1152
	if(f < 0) {
1153
		while(!(*cp1 & 0x80))
1154
			cp1++;
1155
		bp = cp1;
1156
		goto loop;
1157
	}
1158
	ep = cp;
1159
	goto loop;
1160
 
1161
found:
1162
	f = ((cp[0] & 0x7) << 8) |
1163
		(cp[1] & 0xff);
1164
	if(xflag) {
1165
		fprint(2, "=%.*s ", utfnlen(w, n), w);
1166
		typeprint(encode[f]);
1167
	}
1168
	return encode[f];
1169
}
1170
 
1171
void
1172
typeprint(Bits h)
1173
{
1174
 
1175
	pcomma("");
1176
	if(h & NOUN)
1177
		pcomma("n");
1178
	if(h & PROP_COLLECT)
1179
		pcomma("pc");
1180
	if(h & VERB) {
1181
		if((h & VERB) == VERB)
1182
			pcomma("v");
1183
		else
1184
		if((h & VERB) == V_IRREG)
1185
			pcomma("vi");
1186
		else
1187
		if(h & ED)
1188
			pcomma("ed");
1189
	}
1190
	if(h & ADJ)
1191
		pcomma("a");
1192
	if(h & COMP) {
1193
		if((h & COMP) == ACTOR)
1194
			pcomma("er");
1195
		else
1196
			pcomma("comp");
1197
	}
1198
	if(h & DONT_TOUCH)
1199
		pcomma("d");
1200
	if(h & N_AFFIX)
1201
		pcomma("na");
1202
	if(h & ADV)
1203
		pcomma("adv");
1204
	if(h & ION)
1205
		pcomma("ion");
1206
	if(h & V_AFFIX)
1207
		pcomma("va");
1208
	if(h & MAN)
1209
		pcomma("man");
1210
	if(h & NOPREF)
1211
		pcomma("nopref");
1212
	if(h & MONO)
1213
		pcomma("ms");
1214
	if(h & IN)
1215
		pcomma("in");
1216
	if(h & _Y)
1217
		pcomma("y");
1218
	if(h & STOP)
1219
		pcomma("s");
1220
	fprint(2, "\n");
1221
}
1222
 
1223
/*
 * Print s on fd 2, preceded by a comma unless it is the first item
 * since the last reset.  An empty string resets and prints nothing.
 */
void
pcomma(char *s)
{
	static int flag;	/* non-zero once an item has been printed */

	if(*s == 0) {
		flag = 0;
		return;
	}
	if(!flag) {
		fprint(2, "%s", s);
		flag = 1;
	} else
		fprint(2, ",%s", s);
}

/*
1240
 * is the word on of the following
1241
 *	12th	teen
1242
 *	21st	end in 1
1243
 *	23rd	end in 3
1244
 *	77th	default
1245
 * called knowing word[0] is a digit
1246
 */
1247
int
1248
ordinal(void)
1249
{
1250
	char *cp = word;
1251
	static char sp[4];
1252
 
1253
	while(ISDIGIT(*cp))
1254
		cp++;
1255
	strncpy(sp,cp,3);
1256
	if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
1257
		sp[0] = Tolower(cp[0]);
1258
		sp[1] = Tolower(cp[1]);
1259
	}
1260
	return 0 == strncmp(sp,
1261
		cp[-2]=='1'? "th":	/* out of bounds if 1 digit */
1262
		*--cp=='1'? "st":	/* harmless */
1263
		*cp=='2'? "nd":
1264
		*cp=='3'? "rd":
1265
		"th", 3);
1266
}
1267
 
1268
/*
1269
 * read in the dictionary.
1270
 * format is
1271
 * {
1272
 *	short	nencode;
1273
 *	long	encode[nencode];
1274
 *	char	space[*];
1275
 * };
1276
 *
1277
 * the encodings are a table all different
1278
 * affixes.
1279
 * the dictionary proper has 2 bytes
1280
 * that demark and then the rest of the
1281
 * word. the 2 bytes have the following
1282
 *	0x80 0x00	flag
1283
 *	0x78 0x00	count of prefix bytes
1284
 *			common with prev word
1285
 *	0x07 0xff	affix code
1286
 *
1287
 * all ints are big endians in the file.
1288
 */
1289
void
1290
readdict(char *file)
1291
{
1292
	char *s, *is, *lasts, *ls;
1293
	int c, i, sp, p;
1294
	int f;
1295
	long l;
1296
 
1297
	lasts = 0;
1298
	f = open(file, 0);
1299
	if(f == -1) {
1300
		fprint(2, "cannot open %s\n", file);
1301
		exits("open");
1302
	}
1303
	if(read(f, space, 2) != 2)
1304
		goto bad;
1305
	nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
1306
	if(read(f, space, 4*nencode) != 4*nencode)
1307
		goto bad;
1308
	s = space;
1309
	for(i=0; i<nencode; i++) {
1310
		l = (long)(s[0] & 0xff) << 24;
1311
		l |= (s[1] & 0xff) << 16;
1312
		l |= (s[2] & 0xff) << 8;
1313
		l |= s[3] & 0xff;
1314
		encode[i] = (Bits)l;
1315
		s += 4;
1316
	}
1317
	l = read(f, space, sizeof(space));
1318
	if(l == sizeof(space))
1319
		goto noroom;
1320
	is = space + (sizeof(space) - l);
1321
	memmove(is, space, l);
1322
 
1323
	s = space;
1324
	c = *is++ & 0xff;
1325
	sp = -1;
1326
	i = 0;
1327
 
1328
loop:
1329
	if(s > is)
1330
		goto noroom;
1331
	if(c < 0) {
1332
		close(f);
1333
		while(sp < 128*128)
1334
			spacep[++sp] = s;
1335
		*s = 0x80;		/* fence */
1336
		return;
1337
	}
1338
	p = (c>>3) & 0xf;
1339
	*s++ = c;
1340
	*s++ = *is++ & 0xff;
1341
	if(p <= 0)
1342
		i = (*is++ & 0xff)*128;
1343
	if(p <= 1) {
1344
		if(!(*is & 0x80))
1345
			i = i/128*128 + (*is++ & 0xff);
1346
		if(i <= sp) {
1347
			fprint(2, "the dict isnt sorted or \n");
1348
			fprint(2, "memmove didn't work\n");
1349
			goto bad;
1350
		}
1351
		while(sp < i)
1352
			spacep[++sp] = s-2;
1353
	}
1354
	ls = lasts;
1355
	lasts = s;
1356
	for(p-=2; p>0; p--)
1357
		*s++ = *ls++;
1358
	for(;;) {
1359
		if(is >= space+sizeof(space)) {
1360
			c = -1;
1361
			break;
1362
		}
1363
		c = *is++ & 0xff;
1364
		if(c & 0x80)
1365
			break;
1366
		*s++ = c;
1367
	}
1368
	*s = 0;
1369
	goto loop;
1370
 
1371
bad:
1372
	fprint(2, "trouble reading %s\n", file);
1373
	exits("read");
1374
noroom:
1375
	fprint(2, "not enough space for dictionary\n");
1376
	exits("space");
1377
}