Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#include "tdef.h"
2
#include "fns.h"
3
#include "ext.h"
4
#include <assert.h>
5
 
6
#define	HY_BIT	0200	/* stuff in here only works for 7-bit ascii */
7
			/* this value is used (as a literal) in suftab.c */
8
			/* to encode possible hyphenation points in suffixes. */
9
			/* it could be changed, by widening the tables */
10
			/* to be shorts instead of chars. */
11
 
12
/*
13
 * troff8.c
14
 *
15
 * hyphenation
16
 */
17
 
18
/* state of the hyphenation exception word list built by casehw() and searched by exword() */
int	hexsize = 0;		/* hyphenation exception list size (chars allocated at hbufp; grown NHEX at a time) */
19
char	*hbufp = NULL;		/* base of list */
20
char	*nexth = NULL;		/* first free slot in list; NULL until casehw() makes the first allocation */
21
Tchar	*hyend;			/* last character of the part of the word still being examined */
22
 
23
#define THRESH 160 		/* digram goodness threshold */
24
int	thresh = THRESH;	/* threshold in effect; digram() keeps a point only when its score exceeds this */
25
 
26
int	texhyphen(void);
27
static	int	alpha(Tchar);
28
 
29
/*
 * hyphen: locate the hyphenation points of the word starting at wp.
 *
 * The word may be wrapped in punctuation, but must otherwise be a single
 * run of letters that extends to the terminating zero; anything else is
 * left unhyphenated.  On the way, wdstart/wdend/hyend are set to the first
 * and last letters of the word.  Candidate break points are collected in
 * the zero-terminated list at hyptr (through the cursor hyp) and are left
 * sorted into increasing address order.
 */
void hyphen(Tchar *wp)
{
	Tchar *p, *tmp;
	int swapped;

	/* step over leading punctuation; the first other char must be a letter */
	p = wp;
	while (punct(*p))
		p++;
	if (!alpha(*p))
		return;
	wdstart = p;

	/* the word proper is the run of letters beginning there */
	p++;
	while (alpha(*p))
		p++;
	hyend = wdend = p - 1;

	/* only punctuation may follow, right up to the terminating zero */
	while (punct(*p))
		p++;
	if (*p)
		return;

	if (wdend - wdstart < 4)	/* 4 chars is too short to hyphenate */
		return;

	hyp = hyptr;
	*hyp = 0;
	hyoff = 2;

	/*
	 * for now, try exceptions first, then tex (if hyphalg is non-zero),
	 * then suffix and digram if tex didn't hyphenate it at all.
	 */
	if (!exword() && !texhyphen() && !suffix())
		digram();

	/* terminate the list, then bubble-sort the points into increasing order */
	*hyp++ = 0;
	if (*hyptr == 0)
		return;
	do {
		swapped = 0;
		for (hyp = hyptr + 1; *hyp != 0; hyp++) {
			if (hyp[-1] > hyp[0]) {
				tmp = hyp[0];
				hyp[0] = hyp[-1];
				hyp[-1] = tmp;
				swapped++;
			}
		}
	} while (swapped);
}
75
 
76
/*
 * alpha: non-zero if i is really alphabetic.
 *
 * Anything for which ismot() is true is rejected outright, as is any
 * character code at or above ALPHABET: there is no other good way to make
 * sure the value is in range for isalpha().
 *
 * The return type is now spelled out; the definition used to rely on
 * implicit int, which C99 removed and which did not match the
 * `static int alpha(Tchar)' prototype above.
 */
static int alpha(Tchar i)
{
	if (ismot(i))
		return 0;
	if (cbits(i) >= ALPHABET)
		return 0;
	return isalpha(cbits(i));
}
85
 
86
 
87
/*
 * punct: 1 if i is a non-zero character that alpha() does not accept,
 * 0 otherwise.  In particular the terminating zero is never punctuation,
 * so scans that skip punctuation stop at the end of the string.
 *
 * The return type is explicit now (implicit int is not valid C99).
 */
int punct(Tchar i)
{
	return i != 0 && !alpha(i);
}
94
 
95
 
96
void caseha(void)	/* set hyphenation algorithm */
97
{
98
	hyphalg = HYPHALG;
99
	if (skip())
100
		return;
101
	noscale++;
102
	hyphalg = atoi0();
103
	noscale = 0;
104
}
105
 
106
 
107
void caseht(void)	/* set hyphenation threshold;  not in manual! */
108
{
109
	thresh = THRESH;
110
	if (skip())
111
		return;
112
	noscale++;
113
	thresh = atoi0();
114
	noscale = 0;
115
}
116
 
117
 
118
char *growh(char *where)
119
{
120
	char *new;
121
 
122
	hexsize += NHEX;
123
	if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
124
		return NULL;
125
	if (new == hbufp) {
126
		return where;
127
	} else {
128
		int diff;
129
		diff = where - hbufp;
130
		hbufp = new;
131
		return new + diff;
132
	}
133
}
134
 
135
 
136
void casehw(void)
137
{
138
	int i, k;
139
	char *j;
140
	Tchar t;
141
 
142
	if (nexth == NULL) {
143
		if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
144
			ERROR "No space for exception word list." WARN;
145
			return;
146
		}
147
		hexsize = NHEX;
148
	}
149
	k = 0;
150
	while (!skip()) {
151
		if ((j = nexth) >= hbufp + hexsize - 2)
152
			if ((j = nexth = growh(j)) == NULL)
153
				goto full;
154
		for (;;) {
155
			if (ismot(t = getch()))
156
				continue;
157
			i = cbits(t);
158
			if (i == ' ' || i == '\n') {
159
				*j++ = 0;
160
				nexth = j;
161
				*j = 0;
162
				if (i == ' ')
163
					break;
164
				else
165
					return;
166
			}
167
			if (i == '-') {
168
				k = HY_BIT;
169
				continue;
170
			}
171
			*j++ = maplow(i) | k;
172
			k = 0;
173
			if (j >= hbufp + hexsize - 2)
174
				if ((j = growh(j)) == NULL)
175
					goto full;
176
		}
177
	}
178
	return;
179
full:
180
	ERROR "Cannot grow exception word list." WARN;
181
	*nexth = 0;
182
}
183
 
184
 
185
int exword(void)
186
{
187
	Tchar *w;
188
	char *e, *save;
189
 
190
	e = hbufp;
191
	while (1) {
192
		save = e;
193
		if (e == NULL || *e == 0)
194
			return(0);
195
		w = wdstart;
196
		while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
197
			e++;
198
			w++;
199
		}
200
		if (!*e) {
201
			if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
202
				w = wdstart;
203
				for (e = save; *e; e++) {
204
					if (*e & HY_BIT)
205
						*hyp++ = w;
206
					if (hyp > hyptr + NHYP - 1)
207
						hyp = hyptr + NHYP - 1;
208
					w++;
209
				}
210
				return(1);
211
			} else {
212
				e++;
213
				continue;
214
			}
215
		} else
216
			while (*e++)
217
				;
218
	}
219
}
220
 
221
 
222
suffix(void)
223
{
224
	Tchar *w;
225
	char *s, *s0;
226
	Tchar i;
227
	extern char *suftab[];
228
 
229
again:
230
	i = cbits(*hyend);
231
	if (!alpha(i))
232
		return(0);
233
	if (i < 'a')
234
		i -= 'A' - 'a';
235
	if ((s0 = suftab[i-'a']) == 0)
236
		return(0);
237
	for (;;) {
238
		if ((i = *s0 & 017) == 0)
239
			return(0);
240
		s = s0 + i - 1;
241
		w = hyend - 1;
242
		while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
243
			s--;
244
			w--;
245
		}
246
		if (s == s0)
247
			break;
248
		s0 += i;
249
	}
250
	s = s0 + i - 1;
251
	w = hyend;
252
	if (*s0 & HY_BIT)
253
		goto mark;
254
	while (s > s0) {
255
		w--;
256
		if (*s-- & HY_BIT) {
257
mark:
258
			hyend = w - 1;
259
			if (*s0 & 0100)	/* 0100 used in suftab to encode something too */
260
				continue;
261
			if (!chkvow(w))
262
				return(0);
263
			*hyp++ = w;
264
		}
265
	}
266
	if (*s0 & 040)
267
		return(0);
268
	if (exword())
269
		return(1);
270
	goto again;
271
}
272
 
273
 
274
/*
 * maplow: return i folded to lower case if it is an upper-case letter,
 * otherwise i unchanged.
 *
 * Values outside 0..0377 are returned untouched: the <ctype.h> functions
 * are only defined for unsigned char values (and EOF), and callers pass
 * character codes that may lie outside that range.
 * The return type is explicit now (implicit int is not valid C99).
 */
int maplow(int i)
{
	if (i < 0 || i > 0377)
		return(i);
	if (isupper(i))
		i = tolower(i);
	return(i);
}
280
 
281
 
282
/*
 * vowel: 1 if i is one of a, e, i, o, u or y in either case, else 0.
 * The return type is explicit now (implicit int is not valid C99).
 */
int vowel(int i)
{
	switch (i) {
	case 'a': case 'A':
	case 'e': case 'E':
	case 'i': case 'I':
	case 'o': case 'O':
	case 'u': case 'U':
	case 'y': case 'Y':
		return(1);
	default:
		return(0);
	}
}
296
 
297
 
298
/*
 * chkvow: search backwards from the character just before w, as far as
 * wdstart, for a vowel.  Returns a pointer to the nearest one, or NULL if
 * there is none.
 *
 * The scan steps back only while w is still above wdstart, so it never
 * forms a pointer below the start of the word.
 */
Tchar *chkvow(Tchar *w)
{
	while (w > wdstart) {
		w--;
		if (vowel(cbits(*w)))
			return(w);
	}
	return(NULL);
}
305
 
306
 
307
void digram(void)
308
{
309
	int maxval, val;
310
	Tchar *nhyend, *maxw, *w;
311
	extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
312
 
313
	maxw = 0;
314
again:
315
	if (!(w = chkvow(hyend + 1)))
316
		return;
317
	hyend = w;
318
	if (!(w = chkvow(hyend)))
319
		return;
320
	nhyend = w;
321
	maxval = 0;
322
	w--;
323
	while (++w < hyend && w < wdend - 1) {
324
		val = 1;
325
		if (w == wdstart)
326
			val *= dilook('a', cbits(*w), bxh);
327
		else if (w == wdstart + 1)
328
			val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
329
		else
330
			val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
331
		val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
332
		val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
333
		if (val > maxval) {
334
			maxval = val;
335
			maxw = w + 1;
336
		}
337
	}
338
	hyend = nhyend;
339
	if (maxval > thresh)
340
		*hyp++ = maxw;
341
	goto again;
342
}
343
 
344
 
345
/*
 * dilook: look up the 4-bit weight for the letter pair (a, b) in table t.
 *
 * The table has one row per first letter and packs two weights per byte:
 * the high nibble belongs to the even-numbered second letter, the low
 * nibble to the odd-numbered one.  Both letters are folded to lower case.
 *
 * Returns 0 if either character is not in a..z after folding, instead of
 * indexing outside the table.  The byte is masked to 8 bits before being
 * shifted, so the result does not depend on whether char is signed.
 * The return type is explicit now (implicit int is not valid C99).
 */
int dilook(int a, int b, char t[26][13])
{
	int row, col, v;

	row = maplow(a) - 'a';
	col = maplow(b) - 'a';
	if (row < 0 || row >= 26 || col < 0 || col >= 26)
		return(0);
	v = t[row][col / 2] & 0377;
	if (!(col & 01))
		v >>= 4;
	return(v & 017);
}
354
 
355
 
356
/* here beginneth the tex hyphenation code, as interpreted freely */
357
/* the main difference is that there is no attempt to squeeze space */
358
/* as tightly as tex does. */
359
 
360
static int	texit(Tchar *, Tchar *);
361
static int	readpats(void);
362
static void	install(char *);
363
static void	fixup(void);
364
static int	trieindex(int, int);
365
 
366
/* pattern entries, appended one after another by install() */
static char	pats[50000];	/* size ought to be computed dynamically */
367
/* first unused byte of pats */
static char	*nextpat = pats;
368
/* indexed by trieindex() of a pattern's first two characters; fixup() points each slot at the first such entry in pats */
static char	*trie[27*27];	/* english-specific sizes */
369
 
370
int texhyphen(void)
371
{
372
	static int loaded = 0;		/* -1: couldn't find tex file */
373
 
374
	if (hyphalg == 0 || loaded == -1)	/* non-zero => tex for now */
375
		return 0;
376
	if (loaded == 0) {
377
		if (readpats())
378
			loaded = 1;
379
		else
380
			loaded = -1;
381
	}
382
	return texit(wdstart, wdend);
383
}
384
 
385
/*
 * texit: hyphenate as in tex, return # found.
 *
 * The word start..end is copied in lower case into w with a '.' on either
 * side.  For every position in w, the patterns that begin with the same
 * two characters are tried in turn (trie[] gives the first; each entry is
 * preceded by a byte holding the distance to the next).  A pattern entry
 * is its letters, a zero, its digit string, and a zero; when the letters
 * match, each digit raises the count at the corresponding inter-letter
 * position if it is larger.  Positions left with an odd count are
 * hyphenation points, except within the first two and last two letters.
 *
 * Returns hyp - hyptr, i.e. non-zero if the hyphenation list is non-empty.
 *
 * Changes from the previous version: cnt has one more element, since it is
 * indexed up to and including nw, which can reach MAXWORD; a trie slot is
 * tested for null before it is compared with nextpat; and hyp is clamped
 * to the last slot of the hyphenation list as exword() does.
 */
static int texit(Tchar *start, Tchar *end)
{
	enum { MAXWORD = 500 };		/* most characters of w ever used, sentinels included */
	int nw, i, k, equal;
	int cnt[MAXWORD+1];
	char w[MAXWORD+1], *np, *pp, *wp, *xpp, *xwp;

	w[0] = '.';
	for (nw = 1; start <= end && nw < MAXWORD-1; nw++, start++)
		w[nw] = maplow(tolower(cbits(*start)));
	start -= (nw - 1);		/* back to the first letter */
	w[nw++] = '.';
	w[nw] = 0;

	for (i = 0; i <= nw; i++)
		cnt[i] = '0';

	for (wp = w; wp+1 < w+nw; wp++) {
		pp = trie[trieindex(*wp, *(wp+1))];
		while (pp != 0 && pp < nextpat) {
			if (*pp != *wp			/* no match on 1st letter */
			 || *(pp+1) != *(wp+1))		/* no match on 2nd letter */
				break;			/*   so move to next letter of word */
			equal = 1;
			for (xpp = pp+2, xwp = wp+2; *xpp; )
				if (*xpp++ != *xwp++) {
					equal = 0;
					break;
				}
			if (equal) {
				np = xpp+1;	/* numpat */
				for (k = wp-w; *np && k <= nw; k++, np++)
					if (*np > cnt[k])
						cnt[k] = *np;
			}
			pp += *(pp-1);	/* skip over pattern and numbers to next */
		}
	}

	for (i = 1; i < nw - 1; i++)
		if (i > 2 && i < nw - 3 && cnt[i] % 2) {
			*hyp++ = start + i - 1;
			if (hyp > hyptr + NHYP - 1)	/* list full: stay on the last slot */
				hyp = hyptr + NHYP - 1;
		}
	return (int)(hyp - hyptr);	/* non-zero if a hyphen was found */
}
446
 
447
/*
448
	This code assumes that hyphen.tex looks like
449
		% some comments
450
		\patterns{ % more comments
451
		pat5ter4ns, 1 per line, SORTED, nothing else
452
		}
453
		more goo
454
		\hyphenation{ % more comments
455
		ex-cep-tions, one per line; i ignore this part for now
456
		}
457
 
458
	this code is NOT robust against variations.  unfortunately,
459
	it looks like every local language version of this file has
460
	a different format.  i have also made no provision for weird
461
	characters.  sigh.
462
*/
463
 
464
static int readpats(void)
465
{
466
	FILE *fp;
467
	char buf[200], buf1[200];
468
 
469
	if ((fp = fopen(TEXHYPHENS, "r")) == NULL
470
	 && (fp = fopen(DWBalthyphens, "r")) == NULL) {
471
		ERROR "warning: can't find hyphen.tex" WARN;
472
		return 0;
473
	}
474
 
475
	while (fgets(buf, sizeof buf, fp) != NULL) {
476
		sscanf(buf, "%s", buf1);
477
		if (strcmp(buf1, "\\patterns{") == 0)
478
			break;
479
	}
480
	while (fgets(buf, sizeof buf, fp) != NULL) {
481
		if (buf[0] == '}')
482
			break;
483
		install(buf);
484
	}
485
	fclose(fp);
486
	fixup();
487
	return 1;
488
}
489
 
490
static void install(char *s)	/* map ab4c5de to: 12 abcde \0 00405 \0 */
491
{
492
	int npat, lastpat;
493
	char num[500], *onextpat = nextpat;
494
 
495
	num[0] = '0';
496
	*nextpat++ = ' ';	/* fill in with count later */
497
	for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
498
		if (isdigit(*s)) {
499
			num[npat] = *s;
500
			lastpat = npat;
501
		} else {
502
			*nextpat++ = *s;
503
			npat++;
504
			num[npat] = '0';
505
		}
506
	}
507
	*nextpat++ = 0;
508
	if (nextpat > pats + sizeof(pats)-20) {
509
		ERROR "tex hyphenation table overflow, tail end ignored" WARN;
510
		nextpat = onextpat;
511
	}
512
	num[lastpat+1] = 0;
513
	strcat(nextpat, num);
514
	nextpat += strlen(nextpat) + 1;
515
}
516
 
517
static void fixup(void)	/* build indexes of where . a b c ... start */
518
{
519
	char *p, *lastc;
520
	int n;
521
 
522
	for (lastc = pats, p = pats+1; p < nextpat; p++)
523
		if (*p == ' ') {
524
			*lastc = p - lastc;
525
			lastc = p;
526
		}
527
	*lastc = p - lastc;
528
	for (p = pats+1; p < nextpat; ) {
529
		n = trieindex(p[0], p[1]);
530
		if (trie[n] == 0)
531
			trie[n] = p;
532
		p += p[-1];
533
	}
534
	/* printf("pats = %d\n", nextpat - pats); */
535
}
536
 
537
/*
 * trieindex: slot in trie[] for the character pair (d1, d2).
 * '.' counts as 0 and 'a'..'z' as 1..26; the slot is 27 times the first
 * value plus the second.  The result is asserted to lie within the table.
 */
static int trieindex(int d1, int d2)
{
	int hi, lo, slot;

	if (d1 == '.')
		hi = 0;
	else
		hi = d1 - 'a' + 1;
	if (d2 == '.')
		lo = 0;
	else
		lo = d2 - 'a' + 1;
	slot = 27*hi + lo;
	assert(0 <= slot && slot < 27*27);
	return slot;
}
544
}