WebSVN – planix.SVN – Blame – /os/branches/feature_unix/sys/src/cmd/spell/pcode.c

Rev	Author	Line No.	Line
2	-	1	`#include <u.h>`
		2	`#include <libc.h>`
		3	`#include <bio.h>`
		4	`#include <ctype.h>`
		5	`#include "code.h"`
		6
		7	`/* read an annotated spelling list in the form`
		8	`word <tab> affixcode [ , affixcode ] ...`
		9	`print a reencoded version`
		10	`octal <tab> word`
		11	`*/`
		12
		/*
		 * One dictionary entry: the word's text and the code number
		 * that typecode() assigned to its affix list.
		 */
		typedef struct Dict Dict;
		struct Dict
		{
			char*	word;	/* NUL-terminated text; readinput points it into space[] */
			int	encode;	/* index into encodes[] as returned by typecode() */
		};
		19
		20	`Dict words[200000];`
		21	`char space[500000];`
		22	`long encodes[4094];`
		23	`long nspace;`
		24	`long nwords;`
		25	`int ncodes;`
		26	`Biobuf bout;`
		27
		/* forward declarations for the functions defined below */
		void	readinput(int f);
		long	typecode(char *str);
		int	wcmp(void *a, void *b);	/* qsort comparison over Dict entries */
		void	pdict(void);
		void	sput(int);
		void	lput(long);
		33
		34	`void`
		35	`main(int argc, char *argv[])`
		36	`{`
		37	`int f;`
		38
		39	`Binit(&bout, 1, OWRITE);`
		40	`nwords = 0;`
		41	`nspace = 0;`
		42	`ncodes = 0;`
		43	`if(argc <= 1)`
		44	`readinput(0);`
		45	`while(argc > 1) {`
		46	`f = open(argv[1], 0);`
		47	`if(f < 0) {`
		48	`fprint(2, "Cannot open %s\n", argv[1]);`
		49	`exits("open");`
		50	`}`
		51	`readinput(f);`
		52	`argc--;`
		53	`argv++;`
		54	`}`
		55	`fprint(2, "words = %ld; space = %ld; codes = %d\n",`
		56	`nwords, nspace, ncodes);`
		57	`qsort(words, nwords, sizeof(words[0]), wcmp);`
		58	`pdict();`
		59	`exits(0);`
		60	`}`
		61
		62	`wcmp(void a, void b)`
		63	`{`
		64
		65	`return strcmp(((Dict)a)->word, ((Dict)b)->word);`
		66	`}`
		67
		68	`void`
		69	`readinput(int f)`
		70	`{`
		71	`long i;`
		72	`char code, line, *bword;`
		73	`Biobuf buf;`
		74	`long lineno = 0;`
		75
		76	`Binit(&buf, f, OREAD);`
		77	`while(line = Brdline(&buf, '\n')) {`
		78	`line[Blinelen(&buf)-1] = 0;`
		79	`lineno++;`
		80	`code = line;`
		81	`while(isspace(*code))`
		82	`code++;`
		83	`bword = code;`
		84	`while(code && !isspace(code))`
		85	`code++;`
		86
		87	`i = code-bword;`
		88	`memmove(space+nspace, bword, i);`
		89	`words[nwords].word = space+nspace;`
		90	`nspace += i;`
		91	`space[nspace] = 0;`
		92	`nspace++;`
		93
		94	`if(*code) {`
		95	`*code++ = 0;`
		96	`while(isspace(*code))`
		97	`code++;`
		98	`}`
		99	`words[nwords].encode = typecode(code);`
		100	`nwords++;`
		101	`if(nwords >= sizeof(words)/sizeof(words[0])) {`
		102	`fprint(2, "words array too small\n");`
		103	`exits("words");`
		104	`}`
		105	`if(nspace >= sizeof(space)/sizeof(space[0])) {`
		106	`fprint(2, "space array too small\n");`
		107	`exits("space");`
		108	`}`
		109	`}`
		110	`Bterm(&buf);`
		111	`}`
		112
		113
		/*
		 * One affix code: its spelling in the input list and the flag
		 * bits that typecode() ORs into a word's code for it.
		 */
		typedef struct Class Class;
		struct Class
		{
			char*	codename;	/* code text; 0 marks the end of a table */
			long	bits;		/* flag value taken from code.h */
		};
		120	`Class codea[] =`
		121	`{`
		122	`{ "a", ADJ },`
		123	`{ "adv", ADV },`
		124
		125	`};`
		126	`Class codec[] =`
		127	`{`
		128	`{ "comp", COMP },`
		129
		130	`};`
		131	`Class coded[] =`
		132	`{`
		133	`{ "d", DONT_TOUCH},`
		134
		135	`};`
		136
		137	`Class codee[] =`
		138	`{`
		139	`{ "ed", ED },`
		140	`{ "er", ACTOR },`
		141
		142	`};`
		143
		144	`Class codei[] =`
		145	`{`
		146	`{ "in", IN },`
		147	`{ "ion", ION },`
		148
		149	`};`
		150
		151	`Class codem[] =`
		152	`{`
		153	`{ "man", MAN },`
		154	`{ "ms", MONO },`
		155
		156	`};`
		157
		158	`Class coden[] =`
		159	`{`
		160	`{ "n", NOUN },`
		161	`{ "na", N_AFFIX },`
		162	`{ "nopref", NOPREF },`
		163
		164	`};`
		165
		166	`Class codep[] =`
		167	`{`
		168	`{ "pc", PROP_COLLECT },`
		169
		170	`};`
		171	`Class codes[] =`
		172	`{`
		173	`{ "s", STOP },`
		174
		175	`};`
		176
		177	`Class codev[] =`
		178	`{`
		179	`{ "v", VERB },`
		180	`{ "va", V_AFFIX },`
		181	`{ "vi", V_IRREG },`
		182
		183	`};`
		184
		185	`Class codey[] =`
		186	`{`
		187	`{ "y", _Y },`
		188
		189	`};`
		190
		191	`Class codez[] =`
		192	`{`
		193
		194	`};`
		195	`Class* codetab[] =`
		196	`{`
		197	`codea,`
		198	`codez,`
		199	`codec,`
		200	`coded,`
		201	`codee,`
		202	`codez,`
		203	`codez,`
		204	`codez,`
		205	`codei,`
		206	`codez,`
		207	`codez,`
		208	`codez,`
		209	`codem,`
		210	`coden,`
		211	`codez,`
		212	`codep,`
		213	`codez,`
		214	`codez,`
		215	`codes,`
		216	`codez,`
		217	`codez,`
		218	`codev,`
		219	`codez,`
		220	`codez,`
		221	`codey,`
		222	`codez,`
		223	`};`
		224
		225	`long`
		226	`typecode(char *str)`
		227	`{`
		228	`Class *p;`
		229	`long code;`
		230	`int n, i;`
		231	`char s, sp, *st;`
		232
		233	`code = 0;`
		234
		235	`loop:`
		236	`for(s=str; s != 0 && s != ','; s++)`
		237	`;`
		238	`for(p = codetab[*str-'a']; sp = p->codename; p++) {`
		239	`st = str;`
		240	`for(n=s-str;; st++,sp++) {`
		241	`if(st != sp)`
		242	`goto cont;`
		243	`n--;`
		244	`if(n == 0)`
		245	`break;`
		246	`}`
		247	`code \|= p->bits;`
		248	`if(*s == 0)`
		249	`goto out;`
		250	`str = s+1;`
		251	`goto loop;`
		252	`cont:;`
		253	`}`
		254	`fprint(2, "Unknown affix code \"%s\"\n", str);`
		255	`return 0;`
		256	`out:`
		257	`for(i=0; i<ncodes; i++)`
		258	`if(encodes[i] == code)`
		259	`return i;`
		260	`encodes[i] = code;`
		261	`ncodes++;`
		262	`return i;`
		263	`}`
		264
		265	`void`
		266	`sput(int s)`
		267	`{`
		268
		269	`Bputc(&bout, s>>8);`
		270	`Bputc(&bout, s);`
		271	`}`
		272
		273	`void`
		274	`lput(long l)`
		275	`{`
		276	`Bputc(&bout, l>>24);`
		277	`Bputc(&bout, l>>16);`
		278	`Bputc(&bout, l>>8);`
		279	`Bputc(&bout, l);`
		280	`}`
		281
		282	`/*`
		283	`* spit out the encoded dictionary`
		284	`* all numbers are encoded big-endian.`
		285	`* struct`
		286	`* {`
		287	`* short ncodes;`
		288	`* long encodes[ncodes];`
		289	`* struct`
		290	`* {`
		291	`* short encode;`
		292	`* char word[*];`
		293	`* } words[*];`
		294	`* };`
		295	`* 0x8000 flag for code word`
		296	`* 0x7800 count of number of common bytes with previous word`
		297	`* 0x07ff index into codes array for affixes`
		298	`*/`
		299	`void`
		300	`pdict(void)`
		301	`{`
		302	`long i, count;`
		303	`int encode, j, c;`
		304	`char lastword, thisword, *word;`
		305
		306	`sput(ncodes);`
		307	`for(i=0; i<ncodes; i++)`
		308	`lput(encodes[i]);`
		309
		310	`count = ncodes*4 + 2;`
		311	`lastword = "";`
		312	`for(i=0; i<nwords; i++) {`
		313	`word = words[i].word;`
		314	`thisword = word;`
		315	`for(j=0; thisword == lastword; j++) {`
		316	`if(*thisword == 0) {`
		317	`fprint(2, "identical words: %s\n", word);`
		318	`break;`
		319	`}`
		320	`thisword++;`
		321	`lastword++;`
		322	`}`
		323	`if(j > 15)`
		324	`j = 15;`
		325	`encode = words[i].encode;`
		326	`c = (1<<15) \| (j<<11) \| encode;`
		327	`sput(c);`
		328	`count += 2;`
		329	`for(thisword=word+j; c = *thisword; thisword++) {`
		330	`Bputc(&bout, c);`
		331	`count++;`
		332	`}`
		333	`lastword = word;`
		334	`}`
		335	`fprint(2, "output bytes = %ld\n", count);`
		336	`}`

Subversion Repositories planix.SVN

(root)/os/branches/feature_unix/sys/src/cmd/spell/pcode.c – Rev 60