Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include <u.h>
2
#include <libc.h>
3
#include <bio.h>
4
#include <ctype.h>
5
#include "code.h"
6
 
7
/*
 * Read an annotated spelling list in the form
 *	word <tab> affixcode [ , affixcode ] ...
 * and print a re-encoded version in the form
 *	octal <tab> word
 */
12
 
13
/*
 * One dictionary entry: the text of a word and the value that
 * typecode() returned for the affix codes listed with it.
 */
typedef struct Dict
{
	char	*word;		/* NUL-terminated word text */
	int	encode;		/* result of typecode() for this word */
} Dict;
19
 
20
Dict	words[200000];
21
char	space[500000];
22
long	encodes[4094];
23
long	nspace;
24
long	nwords;
25
int	ncodes;
26
Biobuf	bout;
27
 
28
/* Forward declarations for every function defined in this file. */
void	readinput(int f);
long	typecode(char *str);
int	wcmp(void*, void*);
void	pdict(void);
void	sput(int);
void	lput(long);
33
 
34
void
35
main(int argc, char *argv[])
36
{
37
	int f;
38
 
39
	Binit(&bout, 1, OWRITE);
40
	nwords = 0;
41
	nspace = 0;
42
	ncodes = 0;
43
	if(argc <= 1)
44
		readinput(0);
45
	while(argc > 1) {
46
		f = open(argv[1], 0);
47
		if(f < 0) {
48
			fprint(2, "Cannot open %s\n", argv[1]);
49
			exits("open");
50
		}
51
		readinput(f);
52
		argc--;
53
		argv++;
54
	}
55
	fprint(2, "words = %ld; space = %ld; codes = %d\n",
56
		nwords, nspace, ncodes);
57
	qsort(words, nwords, sizeof(words[0]), wcmp);
58
	pdict();
59
	exits(0);
60
}
61
 
62
wcmp(void *a, void *b)
63
{
64
 
65
	return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
66
}
67
 
68
void
69
readinput(int f)
70
{
71
	long i;
72
	char *code, *line, *bword;
73
	Biobuf buf;
74
	long lineno = 0;
75
 
76
	Binit(&buf, f, OREAD);
77
	while(line = Brdline(&buf, '\n')) {
78
		line[Blinelen(&buf)-1] = 0;
79
		lineno++;
80
		code = line;
81
		while(isspace(*code))
82
			code++;
83
		bword = code;
84
		while(*code && !isspace(*code))
85
			code++;
86
 
87
		i = code-bword;
88
		memmove(space+nspace, bword, i);
89
		words[nwords].word = space+nspace;
90
		nspace += i;
91
		space[nspace] = 0;
92
		nspace++;
93
 
94
		if(*code) {
95
			*code++ = 0;
96
			while(isspace(*code))
97
				code++;
98
		}
99
		words[nwords].encode = typecode(code);
100
		nwords++;
101
		if(nwords >= sizeof(words)/sizeof(words[0])) {
102
			fprint(2, "words array too small\n");
103
			exits("words");
104
		}
105
		if(nspace >= sizeof(space)/sizeof(space[0])) {
106
			fprint(2, "space array too small\n");
107
			exits("space");
108
		}
109
	}
110
	Bterm(&buf);
111
}
112
 
113
 
114
typedef	struct	Class	Class;
115
struct	Class
116
{
117
	char*	codename;
118
	long	bits;
119
};
120
Class	codea[]  =
121
{
122
	{ "a", ADJ },
123
	{ "adv", ADV },
124
 
125
};
126
Class	codec[] =
127
{
128
	{ "comp", COMP },
129
 
130
};
131
Class	coded[] =
132
{
133
	{ "d", DONT_TOUCH},
134
 
135
};
136
 
137
Class	codee[] =
138
{
139
	{ "ed",	ED },
140
	{ "er", ACTOR },
141
 
142
};
143
 
144
Class	codei[] =
145
{
146
	{ "in", IN },
147
	{ "ion", ION },
148
 
149
};
150
 
151
Class	codem[] =
152
{
153
	{ "man", MAN },
154
	{ "ms", MONO },
155
 
156
};
157
 
158
Class	coden[] =
159
{
160
	{ "n", NOUN },
161
	{ "na", N_AFFIX },
162
	{ "nopref", NOPREF },
163
 
164
};
165
 
166
Class	codep[] =
167
{
168
	{ "pc", PROP_COLLECT },
169
 
170
};
171
Class	codes[] =
172
{
173
	{ "s", STOP },
174
 
175
};
176
 
177
Class	codev[] =
178
{
179
	{ "v", VERB },
180
	{ "va", V_AFFIX },
181
	{ "vi", V_IRREG },
182
 
183
};
184
 
185
Class	codey[] =
186
{
187
	{ "y", _Y },
188
 
189
};
190
 
191
Class	codez[] =
192
{
193
 
194
};
195
Class*	codetab[] =
196
{
197
	codea,
198
	codez,
199
	codec,
200
	coded,
201
	codee,
202
	codez,
203
	codez,
204
	codez,
205
	codei,
206
	codez,
207
	codez,
208
	codez,
209
	codem,
210
	coden,
211
	codez,
212
	codep,
213
	codez,
214
	codez,
215
	codes,
216
	codez,
217
	codez,
218
	codev,
219
	codez,
220
	codez,
221
	codey,
222
	codez,
223
};
224
 
225
long
226
typecode(char *str)
227
{
228
	Class *p;
229
	long code;
230
	int n, i;
231
	char *s, *sp, *st;
232
 
233
	code = 0;
234
 
235
loop:
236
	for(s=str; *s != 0 && *s != ','; s++)
237
		;
238
	for(p = codetab[*str-'a']; sp = p->codename; p++) {
239
		st = str;
240
		for(n=s-str;; st++,sp++) {
241
			if(*st != *sp)
242
				goto cont;
243
			n--;
244
			if(n == 0)
245
				break;
246
		}
247
		code |= p->bits;
248
		if(*s == 0)
249
			goto out;
250
		str = s+1;
251
		goto loop;
252
	cont:;
253
	}
254
	fprint(2, "Unknown affix code \"%s\"\n", str);
255
	return 0;
256
out:
257
	for(i=0; i<ncodes; i++)
258
		if(encodes[i] == code)
259
			return i;
260
	encodes[i] = code;
261
	ncodes++;
262
	return i;
263
}
264
 
265
void
266
sput(int s)
267
{
268
 
269
	Bputc(&bout, s>>8);
270
	Bputc(&bout, s);
271
}
272
 
273
void
274
lput(long l)
275
{
276
	Bputc(&bout, l>>24);
277
	Bputc(&bout, l>>16);
278
	Bputc(&bout, l>>8);
279
	Bputc(&bout, l);
280
}
281
 
282
/*
283
 * spit out the encoded dictionary
284
 * all numbers are encoded big-endian.
285
 *	struct
286
 *	{
287
 *		short	ncodes;
288
 *		long	encodes[ncodes];
289
 *		struct
290
 *		{
291
 *			short	encode;
292
 *			char	word[*];
293
 *		} words[*];
294
 *	};
295
 * 0x8000 flag for code word
296
 * 0x7800 count of number of common bytes with previous word
297
 * 0x07ff index into codes array for affixes
298
 */
299
void
300
pdict(void)
301
{
302
	long i, count;
303
	int encode, j, c;
304
	char *lastword, *thisword, *word;
305
 
306
	sput(ncodes);
307
	for(i=0; i<ncodes; i++)
308
		lput(encodes[i]);
309
 
310
	count = ncodes*4 + 2;
311
	lastword = "";
312
	for(i=0; i<nwords; i++) {
313
		word = words[i].word;
314
		thisword = word;
315
		for(j=0; *thisword == *lastword; j++) {
316
			if(*thisword == 0) {
317
				fprint(2, "identical words: %s\n", word);
318
				break;
319
			}
320
			thisword++;
321
			lastword++;
322
		}
323
		if(j > 15)
324
			j = 15;
325
		encode = words[i].encode;
326
		c = (1<<15) | (j<<11) | encode;
327
		sput(c);
328
		count += 2;
329
		for(thisword=word+j; c = *thisword; thisword++) {
330
			Bputc(&bout, c);
331
			count++;
332
		}
333
		lastword = word;
334
	}
335
	fprint(2, "output bytes = %ld\n", count);
336
}