/*
 * Subversion Repositories planix.SVN
 *
 * Rev
 *
 * Rev 2 | Blame | Compare with Previous | Last modification | View Log | RSS feed
 */

#ifdef PLAN9
#include        <u.h>
#include        <libc.h>
#include        <bio.h>
#else
#include        <sys/types.h>
#include        <stdio.h>
#include        <stdlib.h>
#include        <string.h>
#include        <unistd.h>
#include        <errno.h>
#include        "plan9.h"
#endif
#include        "hdr.h"

/*
        the our_* routines are implementations for the corresponding library
        routines. for a while, i tried to actually name them wctomb etc
        but stopped that after i found a system which made wchar_t an
        unsigned char.
*/

/* multi-byte (UTF) encoder and decoder working on unsigned long values */
int our_wctomb(char *s, unsigned long wc);
int our_mbtowc(unsigned long *p, char *s, unsigned n);
/* codec for the table-driven encoding used by isoutf_in and isoutf_out */
int runetoisoutf(char *str, Rune *rune);
int fullisorune(char *str, int n);
int isochartorune(Rune *rune, char *str);

/*
 * utf_in: read the input on fd, decode it with our_mbtowc and hand
 * each batch of decoded values to OUT through the runes array.
 *
 *      fd      descriptor to read from
 *      notused ignored
 *      out     passed through to OUT unchanged
 *
 * A sequence our_mbtowc rejects is reported when squawk is set. It is
 * then either skipped one byte at a time (clean set) or counted in
 * nerrors and replaced by Runeerror. Reading stops when read returns 0
 * or a negative value; a negative value produces no diagnostic here.
 * A final OUT call with a count of 0 is always made.
 */
void
utf_in(int fd, long *notused, struct convert *out)
{
        char buf[N];
        int i, j, c, n, tot;
        ulong l;

        USED(notused);
        tot = 0;        /* bytes currently held in buf */
        while((n = read(fd, buf+tot, N-tot)) >= 0){
                tot += n;
                /*
                 * keep decoding while at least UTFmax bytes are buffered, or
                 * while fewer remain but either the input has ended (n==0)
                 * or fullrune reports that a whole sequence is present.
                 */
                for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
                        c = our_mbtowc(&l, buf+i, tot-i);
                        if(c == -1){
                                if(squawk)
                                        EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
                                if(clean){
                                        /* drop the offending byte and retry at the next one */
                                        i++;
                                        continue;
                                }
                                nerrors++;
                                l = Runeerror;
                                c = 1;
                        }
                        runes[j++] = l;
                        i += c;
                }
                OUT(out, runes, j);
                /* keep the undecoded tail at the front of buf for the next read */
                tot -= i;
                ninput += i;
                if(tot)
                        memmove(buf, buf+i, tot);
                if(n == 0)
                        break;
        }
        OUT(out, runes, 0);
}

/*
 * utf_out: encode the n runes starting at base with our_wctomb into
 * obuf and write the resulting bytes to file descriptor 1.
 * nrunes grows by n and noutput by the number of bytes produced.
 * notused is ignored.
 */
void
utf_out(Rune *base, int n, long *notused)
{
        char *dst;
        int k;

        USED(notused);
        nrunes += n;
        dst = obuf;
        for(k = 0; k < n; k++)
                dst += our_wctomb(dst, base[k]);
        noutput += dst-obuf;
        write(1, obuf, dst-obuf);
}

/*
 * isoutf_in: read the input on fd, decode it with isochartorune and
 * hand each batch of decoded runes to OUT through the runes array.
 *
 *      fd      descriptor to read from
 *      notused ignored
 *      out     passed through to OUT unchanged
 *
 * A bad sequence (isochartorune yields Runeerror with length 1) is
 * reported when squawk is set. It is then either skipped one byte at a
 * time (clean set) or counted in nerrors and passed on as Runeerror.
 * A sequence that is cut short by the end of the input is handled the
 * same way, so trailing bytes are never dropped silently.
 * Reading stops when read returns 0 or a negative value. A final OUT
 * call with a count of 0 is always made.
 */
void
isoutf_in(int fd, long *notused, struct convert *out)
{
        char buf[N];
        int i, j, c, n, tot;

        USED(notused);
        tot = 0;        /* bytes currently held in buf */
        while((n = read(fd, buf+tot, N-tot)) >= 0){
                tot += n;
                for(i=j=0; i<tot; ){
                        if(fullisorune(buf+i, tot-i))
                                c = isochartorune(&runes[j], buf+i);
                        else if(n != 0)
                                break;          /* more input may complete the sequence */
                        else{
                                /* end of input inside a sequence: treat as bad */
                                runes[j] = Runeerror;
                                c = 1;
                        }
                        if(runes[j] == Runeerror && c == 1){
                                if(squawk)
                                        EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
                                if(clean){
                                        /* drop the offending byte and retry at the next one */
                                        i++;
                                        continue;
                                }
                                nerrors++;
                        }
                        j++;
                        i += c;
                }
                OUT(out, runes, j);
                /* keep the undecoded tail at the front of buf for the next read */
                tot -= i;
                ninput += i;
                if(tot)
                        memmove(buf, buf+i, tot);
                if(n == 0)
                        break;
        }
        OUT(out, runes, 0);
}

/*
 * isoutf_out: encode the n runes starting at base with runetoisoutf
 * into obuf and write the resulting bytes to file descriptor 1.
 * nrunes grows by n and noutput by the number of bytes produced.
 * notused is ignored.
 */
void
isoutf_out(Rune *base, int n, long *notused)
{
        char *dst;
        int k;

        USED(notused);
        nrunes += n;
        dst = obuf;
        for(k = 0; k < n; k++)
                dst += runetoisoutf(dst, &base[k]);
        noutput += dst-obuf;
        write(1, obuf, dst-obuf);
}


/*
 * Boundaries used by isochartorune, runetoisoutf and fullisorune.
 *
 *      CharN   first lead byte of the next encoding class
 *      RuneN   first rune value of the next encoding class
 *      Esc     radix of the multi-byte forms; values taken from the
 *              U table are valid digits only when below Esc
 *      Bad     rune stored when a sequence cannot be decoded
 *
 * NOTE(review): Char1 and Rune1 come from Runeself, which is defined
 * outside this file; the "00-9F" ranges quoted in the comments of the
 * codec routines only hold if Runeself is 0xA0 -- confirm.
 */
enum
{
        Char1   = Runeself,     Rune1   = Runeself,
        Char21  = 0xA1,         Rune21  = 0x0100,
        Char22  = 0xF6,         Rune22  = 0x4016,
        Char3   = 0xFC,         Rune3   = 0x10000,      /* really 0x38E2E */
        Esc     = 0xBE,         Bad     = Runeerror
};

/*
 * U maps an input byte to its digit value; T is the inverse mapping
 * from digit value back to byte. Both are filled in by mktable.
 */
static uchar U[256];
static uchar T[256];

/*
 * mktable: fill U and T. The 256 byte values are split into four
 * ranges, each shifted by a fixed offset:
 *      00-20 -> BE-DE
 *      21-7E -> 00-5D
 *      7F-9F -> DF-FF
 *      A0-FF -> 5E-BD
 * so every byte gets a distinct value and T[U[b]] == b.
 */
static void
mktable(void)
{
        int b, digit;

        for(b = 0; b < 256; b++){
                if(b < 0x21)
                        digit = b + (0xBE - 0x00);
                else if(b < 0x7F)
                        digit = b + (0x00 - 0x21);
                else if(b < 0xA0)
                        digit = b + (0xDF - 0x7F);
                else
                        digit = b + (0x5E - 0xA0);
                U[b] = digit;
                T[digit] = b;
        }
}

/*
 * isochartorune: decode one sequence starting at str into *rune and
 * return the number of bytes it occupies (1, 2 or 3).
 *
 * On a malformed sequence *rune is set to Bad and 1 is returned.
 * No length is passed in: for a lead byte of Char1 or above the byte
 * at str[1] is read, and for a lead byte of Char22 or above with a
 * valid second digit str[2] is read as well. Callers in this file
 * check with fullisorune first.
 */
int
isochartorune(Rune *rune, char *str)
{
        uchar *in;
        int lead, d1, d2;
        long val;

        /* U[0] is nonzero once mktable has run */
        if(U[0] == 0)
                mktable();

        in = (uchar*)str;
        lead = in[0];

        /* one byte below Char1 stands for itself */
        if(lead < Char1){
                *rune = lead;
                return 1;
        }

        /* lead in [Char1, Char21): the second byte is the rune value */
        if(lead < Char21){
                d1 = in[1];
                if(d1 < Rune1 || d1 >= Rune21)
                        goto bad;
                *rune = d1;
                return 2;
        }

        /* remaining forms use base-Esc digits looked up in U */
        d1 = U[in[1]];
        if(d1 >= Esc)
                goto bad;

        /* lead in [Char21, Char22): two bytes, runes from Rune21 up */
        if(lead < Char22){
                *rune = (lead-Char21)*Esc + d1 + Rune21;
                return 2;
        }

        /* lead in [Char22, Char3): three bytes, runes from Rune22 up */
        d2 = U[in[2]];
        if(d2 >= Esc || lead >= Char3)
                goto bad;
        val = (lead-Char22)*Esc*Esc + d1*Esc + d2 + Rune22;
        if(val >= Rune3)
                goto bad;
        *rune = val;
        return 3;

bad:
        *rune = Bad;
        return 1;
}

/*
 * runetoisoutf: encode *rune at str and return the number of bytes
 * written (1, 2 or 3). str must have room for three bytes.
 *
 *      rune < Rune1            the value itself
 *      rune < Rune21           Char1, then the value itself
 *      rune < Rune22           lead byte from Char21, one base-Esc digit
 *      otherwise               lead byte from Char22, two base-Esc digits
 *
 * Digits are turned into bytes through the T table. No upper bound is
 * checked on the rune value.
 */
int
runetoisoutf(char *str, Rune *rune)
{
        long v, off;

        /* T[0] is nonzero once mktable has run */
        if(T[0] == 0)
                mktable();

        v = *rune;
        if(v < Rune1){
                str[0] = v;
                return 1;
        }
        if(v < Rune21){
                str[0] = Char1;
                str[1] = v;
                return 2;
        }
        if(v < Rune22){
                off = v - Rune21;
                str[0] = off/Esc + Char21;
                str[1] = T[off%Esc];
                return 2;
        }

        off = v - Rune22;
        str[0] = off/(Esc*Esc) + Char22;
        str[1] = T[(off/Esc)%Esc];
        str[2] = T[off%Esc];
        return 3;
}

/*
 * fullisorune: report whether the n bytes at str hold at least one
 * complete sequence, judged from the lead byte alone.
 * Returns 1 if so, 0 otherwise (including n <= 0).
 *
 *      lead < Char1    one byte is enough
 *      lead < Char22   two bytes are needed
 *      otherwise       three bytes are needed
 */
int
fullisorune(char *str, int n)
{
        int lead;

        if(n <= 0)
                return 0;
        lead = *(uchar*)str;
        if(lead < Char1)
                return 1;
        if(n == 1)
                return 0;
        return lead < Char22 || n > 2;
}

/*
 * The Plan 9 build does not include <errno.h>, so define the errno
 * variable that our_mbtowc sets when it rejects a sequence.
 */
#ifdef PLAN9
int     errno;
#endif

/*
 * Constants shared by our_wctomb and our_mbtowc.
 *
 *      Tn      tag bits of the lead byte of an n-byte sequence
 *      Tx      tag bits of a continuation byte
 *      Bitn    number of payload bits in the lead byte of an n-byte sequence
 *      Bitx    number of payload bits in a continuation byte
 *      Maskn   mask selecting the payload bits counted by Bitn
 *      Wcharn  largest value that fits in an n-byte sequence
 *
 * Bit6 is 2, so a six-byte sequence carries 2 + 5*6 = 32 bits and the
 * lead bytes 0xFE and 0xFF are both produced and accepted.
 * EILSEQ is supplied here only when the system headers lack it.
 */
enum
{
        T1      = 0x00,
        Tx      = 0x80,
        T2      = 0xC0,
        T3      = 0xE0,
        T4      = 0xF0,
        T5      = 0xF8,
        T6      = 0xFC,

        Bit1    = 7,
        Bitx    = 6,
        Bit2    = 5,
        Bit3    = 4,
        Bit4    = 3,
        Bit5    = 2,
        Bit6    = 2,

        Mask1   = (1<<Bit1)-1,
        Maskx   = (1<<Bitx)-1,
        Mask2   = (1<<Bit2)-1,
        Mask3   = (1<<Bit3)-1,
        Mask4   = (1<<Bit4)-1,
        Mask5   = (1<<Bit5)-1,
        Mask6   = (1<<Bit6)-1,

        Wchar1  = (1UL<<Bit1)-1,
        Wchar2  = (1UL<<(Bit2+Bitx))-1,
        Wchar3  = (1UL<<(Bit3+2*Bitx))-1,
        Wchar4  = (1UL<<(Bit4+3*Bitx))-1,
        Wchar5  = (1UL<<(Bit5+4*Bitx))-1,

#ifndef EILSEQ
        EILSEQ  = 123,
#endif /* EILSEQ */
};

/*
 * our_wctomb: encode wc at s and return the number of bytes written
 * (1 to 6). s must have room for six bytes.
 * A null s returns 0, meaning the encoding has no shift states.
 * Only the low 32 bits of wc are encoded: the six-byte form keeps two
 * bits in its lead byte and drops anything above them.
 */
int
our_wctomb(char *s, unsigned long wc)
{
        if(s == 0)
                return 0;               /* no shift states */
        if(wc & ~Wchar2) {
                if(wc & ~Wchar4) {
                        if(wc & ~Wchar5) {
                                /* 6 bytes */
                                s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
                                s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
                                s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
                                s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
                                s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
                                s[5] = Tx |  (wc & Maskx);
                                return 6;
                        }
                        /* 5 bytes */
                        s[0] = T5 |  (wc >> 4*Bitx);
                        s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
                        s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
                        s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
                        s[4] = Tx |  (wc & Maskx);
                        return 5;
                }
                if(wc & ~Wchar3) {
                        /* 4 bytes */
                        s[0] = T4 |  (wc >> 3*Bitx);
                        s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
                        s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
                        s[3] = Tx |  (wc & Maskx);
                        return 4;
                }
                /* 3 bytes */
                s[0] = T3 |  (wc >> 2*Bitx);
                s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
                s[2] = Tx |  (wc & Maskx);
                return 3;
        }
        if(wc & ~Wchar1) {
                /* 2 bytes */
                s[0] = T2 | (wc >> 1*Bitx);
                s[1] = Tx | (wc & Maskx);
                return 2;
        }
        /* 1 byte */
        s[0] = T1 | wc;
        return 1;
}

/*
 * our_mbtowc: decode one sequence from the n bytes at s into *p and
 * return its length in bytes (1 to 6).
 * A null s returns 0, meaning the encoding has no shift states.
 * Returns -1 with errno set to EILSEQ, leaving *p untouched, when
 *      - n is 0 or the sequence needs more than n bytes,
 *      - the first byte is a continuation byte,
 *      - a continuation byte lacks the Tx tag, or
 *      - the value would have fitted in a shorter sequence (overlong).
 *
 * The value is accumulated in unsigned long. Doing it in int overflows
 * for the six-byte lead bytes 0xFE and 0xFF, whose payload is shifted
 * up to bit 31; that is undefined and sign-extends on 64-bit longs.
 */
int
our_mbtowc(unsigned long *p, char *s, unsigned n)
{
        unsigned char *us;
        int c0, c1, c2, c3, c4, c5;
        unsigned long wc;

        if(s == 0)
                return 0;               /* no shift states */

        if(n < 1)
                goto bad;
        us = (unsigned char*)s;
        c0 = us[0];
        if(c0 >= T3) {
                if(n < 3)
                        goto bad;
                /* a continuation byte xor Tx leaves only its six payload bits */
                c1 = us[1] ^ Tx;
                c2 = us[2] ^ Tx;
                if((c1|c2) & T2)
                        goto bad;
                if(c0 >= T5) {
                        if(n < 5)
                                goto bad;
                        c3 = us[3] ^ Tx;
                        c4 = us[4] ^ Tx;
                        if((c3|c4) & T2)
                                goto bad;
                        if(c0 >= T6) {
                                /* 6 bytes */
                                if(n < 6)
                                        goto bad;
                                c5 = us[5] ^ Tx;
                                if(c5 & T2)
                                        goto bad;
                                wc = (unsigned long)(c0 & Mask6);
                                wc = (wc << Bitx) | c1;
                                wc = (wc << Bitx) | c2;
                                wc = (wc << Bitx) | c3;
                                wc = (wc << Bitx) | c4;
                                wc = (wc << Bitx) | c5;
                                if(wc <= Wchar5)
                                        goto bad;
                                *p = wc;
                                return 6;
                        }
                        /* 5 bytes */
                        wc = (unsigned long)(c0 & Mask5);
                        wc = (wc << Bitx) | c1;
                        wc = (wc << Bitx) | c2;
                        wc = (wc << Bitx) | c3;
                        wc = (wc << Bitx) | c4;
                        if(wc <= Wchar4)
                                goto bad;
                        *p = wc;
                        return 5;
                }
                if(c0 >= T4) {
                        /* 4 bytes */
                        if(n < 4)
                                goto bad;
                        c3 = us[3] ^ Tx;
                        if(c3 & T2)
                                goto bad;
                        wc = (unsigned long)(c0 & Mask4);
                        wc = (wc << Bitx) | c1;
                        wc = (wc << Bitx) | c2;
                        wc = (wc << Bitx) | c3;
                        if(wc <= Wchar3)
                                goto bad;
                        *p = wc;
                        return 4;
                }
                /* 3 bytes */
                wc = (unsigned long)(c0 & Mask3);
                wc = (wc << Bitx) | c1;
                wc = (wc << Bitx) | c2;
                if(wc <= Wchar2)
                        goto bad;
                *p = wc;
                return 3;
        }
        if(c0 >= T2) {
                /* 2 bytes */
                if(n < 2)
                        goto bad;
                c1 = us[1] ^ Tx;
                if(c1 & T2)
                        goto bad;
                wc = (unsigned long)(c0 & Mask2);
                wc = (wc << Bitx) | c1;
                if(wc <= Wchar1)
                        goto bad;
                *p = wc;
                return 2;
        }
        /* 1 byte */
        if(c0 >= Tx)
                goto bad;
        *p = c0;
        return 1;

bad:
        errno = EILSEQ;
        return -1;
}