Subversion Repositories planix.SVN

Rev

Blame | Last modification | View Log | RSS feed

#include "tdef.h"
#include "fns.h"
#include "ext.h"
#include <assert.h>

#define HY_BIT  0200    /* stuff in here only works for 7-bit ascii */
                        /* this value is used (as a literal) in suftab.c */
                        /* to encode possible hyphenation points in suffixes. */
                        /* it could be changed, by widening the tables */
                        /* to be shorts instead of chars. */

/*
 * troff8.c
 *
 * hyphenation
 */

int     hexsize = 0;            /* hyphenation exception list size */
char    *hbufp = NULL;          /* base of list */
char    *nexth = NULL;          /* first free slot in list */
Tchar   *hyend;

#define THRESH 160              /* digram goodness threshold */
int     thresh = THRESH;

int     texhyphen(void);
static  int     alpha(Tchar);

void hyphen(Tchar *wp)
{
        int j;
        Tchar *i;

        i = wp;
        while (punct((*i++)))
                ;
        if (!alpha(*--i))
                return;
        wdstart = i++;
        while (alpha(*i++))
                ;
        hyend = wdend = --i - 1;
        while (punct((*i++)))
                ;
        if (*--i)
                return;
        if (wdend - wdstart < 4)        /* 4 chars is too short to hyphenate */
                return;
        hyp = hyptr;
        *hyp = 0;
        hyoff = 2;

        /* for now, try exceptions first, then tex (if hyphalg is non-zero),
           then suffix and digram if tex didn't hyphenate it at all.
        */

        if (!exword() && !texhyphen() && !suffix())
                digram();

        /* this appears to sort hyphenation points into increasing order */
        *hyp++ = 0;
        if (*hyptr)
                for (j = 1; j; ) {
                        j = 0;
                        for (hyp = hyptr + 1; *hyp != 0; hyp++) {
                                if (*(hyp - 1) > *hyp) {
                                        j++;
                                        i = *hyp;
                                        *hyp = *(hyp - 1);
                                        *(hyp - 1) = i;
                                }
                        }
                }
}

static alpha(Tchar i)   /* non-zero if really alphabetic */
{
        if (ismot(i))
                return 0;
        else if (cbits(i) >= ALPHABET)  /* this isn't very elegant, but there's */
                return 0;               /* no good way to make sure i is in range for */
        else                            /* the call of isalpha */
                return isalpha(cbits(i));
}


punct(Tchar i)
{
        if (!i || alpha(i))
                return(0);
        else
                return(1);
}


void caseha(void)       /* set hyphenation algorithm */
{
        hyphalg = HYPHALG;
        if (skip())
                return;
        noscale++;
        hyphalg = atoi0();
        noscale = 0;
}


void caseht(void)       /* set hyphenation threshold;  not in manual! */
{
        thresh = THRESH;
        if (skip())
                return;
        noscale++;
        thresh = atoi0();
        noscale = 0;
}


char *growh(char *where)
{
        char *new;

        hexsize += NHEX;
        if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
                return NULL;
        if (new == hbufp) {
                return where;
        } else {
                int diff;
                diff = where - hbufp;
                hbufp = new;
                return new + diff;
        }
}


void casehw(void)
{
        int i, k;
        char *j;
        Tchar t;

        if (nexth == NULL) {
                if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
                        ERROR "No space for exception word list." WARN;
                        return;
                }
                hexsize = NHEX;
        }
        k = 0;
        while (!skip()) {
                if ((j = nexth) >= hbufp + hexsize - 2)
                        if ((j = nexth = growh(j)) == NULL)
                                goto full;
                for (;;) {
                        if (ismot(t = getch()))
                                continue;
                        i = cbits(t);
                        if (i == ' ' || i == '\n') {
                                *j++ = 0;
                                nexth = j;
                                *j = 0;
                                if (i == ' ')
                                        break;
                                else
                                        return;
                        }
                        if (i == '-') {
                                k = HY_BIT;
                                continue;
                        }
                        *j++ = maplow(i) | k;
                        k = 0;
                        if (j >= hbufp + hexsize - 2)
                                if ((j = growh(j)) == NULL)
                                        goto full;
                }
        }
        return;
full:
        ERROR "Cannot grow exception word list." WARN;
        *nexth = 0;
}


int exword(void)
{
        Tchar *w;
        char *e, *save;

        e = hbufp;
        while (1) {
                save = e;
                if (e == NULL || *e == 0)
                        return(0);
                w = wdstart;
                while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
                        e++;
                        w++;
                }
                if (!*e) {
                        if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
                                w = wdstart;
                                for (e = save; *e; e++) {
                                        if (*e & HY_BIT)
                                                *hyp++ = w;
                                        if (hyp > hyptr + NHYP - 1)
                                                hyp = hyptr + NHYP - 1;
                                        w++;
                                }
                                return(1);
                        } else {
                                e++;
                                continue;
                        }
                } else
                        while (*e++)
                                ;
        }
}


suffix(void)
{
        Tchar *w;
        char *s, *s0;
        Tchar i;
        extern char *suftab[];

again:
        i = cbits(*hyend);
        if (!alpha(i))
                return(0);
        if (i < 'a')
                i -= 'A' - 'a';
        if ((s0 = suftab[i-'a']) == 0)
                return(0);
        for (;;) {
                if ((i = *s0 & 017) == 0)
                        return(0);
                s = s0 + i - 1;
                w = hyend - 1;
                while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
                        s--;
                        w--;
                }
                if (s == s0)
                        break;
                s0 += i;
        }
        s = s0 + i - 1;
        w = hyend;
        if (*s0 & HY_BIT)
                goto mark;
        while (s > s0) {
                w--;
                if (*s-- & HY_BIT) {
mark:
                        hyend = w - 1;
                        if (*s0 & 0100) /* 0100 used in suftab to encode something too */
                                continue;
                        if (!chkvow(w))
                                return(0);
                        *hyp++ = w;
                }
        }
        if (*s0 & 040)
                return(0);
        if (exword())
                return(1);
        goto again;
}


maplow(int i)
{
        if (isupper(i))
                i = tolower(i);
        return(i);
}


vowel(int i)
{
        switch (i) {
        case 'a': case 'A':
        case 'e': case 'E':
        case 'i': case 'I':
        case 'o': case 'O':
        case 'u': case 'U':
        case 'y': case 'Y':
                return(1);
        default:
                return(0);
        }
}


Tchar *chkvow(Tchar *w)
{
        while (--w >= wdstart)
                if (vowel(cbits(*w)))
                        return(w);
        return(0);
}


void digram(void)
{
        int maxval, val;
        Tchar *nhyend, *maxw, *w;
        extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];

        maxw = 0;
again:
        if (!(w = chkvow(hyend + 1)))
                return;
        hyend = w;
        if (!(w = chkvow(hyend)))
                return;
        nhyend = w;
        maxval = 0;
        w--;
        while (++w < hyend && w < wdend - 1) {
                val = 1;
                if (w == wdstart)
                        val *= dilook('a', cbits(*w), bxh);
                else if (w == wdstart + 1)
                        val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
                else
                        val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
                val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
                val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
                if (val > maxval) {
                        maxval = val;
                        maxw = w + 1;
                }
        }
        hyend = nhyend;
        if (maxval > thresh)
                *hyp++ = maxw;
        goto again;
}


dilook(int a, int b, char t[26][13])
{
        int i, j;

        i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
        if (!(j & 01))
                i >>= 4;
        return(i & 017);
}


/* here beginneth the tex hyphenation code, as interpreted freely */
/* the main difference is that there is no attempt to squeeze space */
/* as tightly at tex does. */

static int      texit(Tchar *, Tchar *);
static int      readpats(void);
static void     install(char *);
static void     fixup(void);
static int      trieindex(int, int);

static char     pats[50000];    /* size ought to be computed dynamically */
static char     *nextpat = pats;
static char     *trie[27*27];   /* english-specific sizes */

int texhyphen(void)
{
        static int loaded = 0;          /* -1: couldn't find tex file */

        if (hyphalg == 0 || loaded == -1)       /* non-zero => tex for now */
                return 0;
        if (loaded == 0) {
                if (readpats())
                        loaded = 1;
                else
                        loaded = -1;
        }
        return texit(wdstart, wdend);
}

static int texit(Tchar *start, Tchar *end)      /* hyphenate as in tex, return # found */
{
        int nw, i, k, equal, cnt[500];
        char w[500+1], *np, *pp, *wp, *xpp, *xwp;

        w[0] = '.';
        for (nw = 1; start <= end && nw < 500-1; nw++, start++)
                w[nw] = maplow(tolower(cbits(*start)));
        start -= (nw - 1);
        w[nw++] = '.';
        w[nw] = 0;
/*
 * printf("try %s\n", w);
*/
        for (i = 0; i <= nw; i++)
                cnt[i] = '0';

        for (wp = w; wp+1 < w+nw; wp++) {
                for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
                        if (pp == 0             /* no trie entry */
                         || *pp != *wp          /* no match on 1st letter */
                         || *(pp+1) != *(wp+1)) /* no match on 2nd letter */
                                break;          /*   so move to next letter of word */
                        equal = 1;
                        for (xpp = pp+2, xwp = wp+2; *xpp; )
                                if (*xpp++ != *xwp++) {
                                        equal = 0;
                                        break;
                                }
                        if (equal) {
                                np = xpp+1;     /* numpat */
                                for (k = wp-w; *np; k++, np++)
                                        if (*np > cnt[k])
                                                cnt[k] = *np;
/*
 * printf("match: %s  %s\n", pp, xpp+1);
*/
                        }
                        pp += *(pp-1);  /* skip over pattern and numbers to next */
                }
        }
/*
 * for (i = 0; i < nw; i++) printf("%c", w[i]);
 * printf("  ");
 * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
 * printf("\n");
*/
/*
 *      for (i = 1; i < nw - 1; i++) {
 *              if (i > 2 && i < nw - 3 && cnt[i] % 2)
 *                      printf("-");
 *              if (cbits(start[i-1]) != '.')
 *                      printf("%c", cbits(start[i-1]));
 *      }
 *      printf("\n");
*/
        for (i = 1; i < nw -1; i++)
                if (i > 2 && i < nw - 3 && cnt[i] % 2)
                        *hyp++ = start + i - 1;
        return hyp - hyptr;     /* non-zero if a hyphen was found */
}

/*
        This code assumes that hyphen.tex looks like
                % some comments
                \patterns{ % more comments
                pat5ter4ns, 1 per line, SORTED, nothing else
                }
                more goo
                \hyphenation{ % more comments
                ex-cep-tions, one per line; i ignore this part for now
                }

        this code is NOT robust against variations.  unfortunately,
        it looks like every local language version of this file has
        a different format.  i have also made no provision for weird
        characters.  sigh.
*/

static int readpats(void)
{
        FILE *fp;
        char buf[200], buf1[200];

        if ((fp = fopen(TEXHYPHENS, "r")) == NULL
         && (fp = fopen(DWBalthyphens, "r")) == NULL) {
                ERROR "warning: can't find hyphen.tex" WARN;
                return 0;
        }

        while (fgets(buf, sizeof buf, fp) != NULL) {
                sscanf(buf, "%s", buf1);
                if (strcmp(buf1, "\\patterns{") == 0)
                        break;
        }
        while (fgets(buf, sizeof buf, fp) != NULL) {
                if (buf[0] == '}')
                        break;
                install(buf);
        }
        fclose(fp);
        fixup();
        return 1;
}

static void install(char *s)    /* map ab4c5de to: 12 abcde \0 00405 \0 */
{
        int npat, lastpat;
        char num[500], *onextpat = nextpat;

        num[0] = '0';
        *nextpat++ = ' ';       /* fill in with count later */
        for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
                if (isdigit(*s)) {
                        num[npat] = *s;
                        lastpat = npat;
                } else {
                        *nextpat++ = *s;
                        npat++;
                        num[npat] = '0';
                }
        }
        *nextpat++ = 0;
        if (nextpat > pats + sizeof(pats)-20) {
                ERROR "tex hyphenation table overflow, tail end ignored" WARN;
                nextpat = onextpat;
        }
        num[lastpat+1] = 0;
        strcat(nextpat, num);
        nextpat += strlen(nextpat) + 1;
}

static void fixup(void) /* build indexes of where . a b c ... start */
{
        char *p, *lastc;
        int n;

        for (lastc = pats, p = pats+1; p < nextpat; p++)
                if (*p == ' ') {
                        *lastc = p - lastc;
                        lastc = p;
                }
        *lastc = p - lastc;
        for (p = pats+1; p < nextpat; ) {
                n = trieindex(p[0], p[1]);
                if (trie[n] == 0)
                        trie[n] = p;
                p += p[-1];
        }
        /* printf("pats = %d\n", nextpat - pats); */
}

static int trieindex(int d1, int d2)
{
        int i;

        i = 27*(d1 == '.'? 0: d1 - 'a' + 1) + (d2 == '.'? 0: d2 - 'a' + 1);
        assert(0 <= i && i < 27*27);
        return i;
}