/*
 * Subversion Repositories planix.SVN
 *
 * Rev
 *
 * Rev 2 | Blame | Compare with Previous | Last modification | View Log | RSS feed
 */

#include <u.h>
#include <libc.h>
#include <bio.h>
#include "hdr.h"
#include "conv.h"

/*
 * One HTML character entity: a name paired with a rune value.
 * The tables below initialize entries positionally as {name, rune},
 * so the field order must not change.
 */
typedef struct Hchar Hchar;
struct Hchar
{
        /* entity name as written between '&' and ';' */
        char *s;
        /* rune value the name stands for */
        Rune r;
};

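/*
 * &lt;, &gt;, &quot;, &amp; are recognized on input but are
 * intentionally never generated on output: html_out writes
 * runes below Runeself through unescaped.
 */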

/*
 * Table of HTML character entity names and the runes they denote.
 * html_init copies it into byrune and then sorts it by name so
 * that findbyname can binary search it.
 *
 * Names beginning with _ are names we recognize
 * (without the underscore) but will not generate,
 * because they are nonstandard.  html_init strips the
 * underscore from the name in this table and sets the rune of
 * the byrune copy to Runeerror, which findbyrune never looks up.
 */
static Hchar byname[] =
{
        {"AElig", 198},
        {"Aacute", 193},
        {"Acirc", 194},
        {"Agrave", 192},
        {"Alpha", 913},
        {"Aring", 197},
        {"Atilde", 195},
        {"Auml", 196},
        {"Beta", 914},
        {"Ccedil", 199},
        {"Chi", 935},
        {"Dagger", 8225},
        {"Delta", 916},
        {"ETH", 208},
        {"Eacute", 201},
        {"Ecirc", 202},
        {"Egrave", 200},
        {"Epsilon", 917},
        {"Eta", 919},
        {"Euml", 203},
        {"Gamma", 915},
        {"Iacute", 205},
        {"Icirc", 206},
        {"Igrave", 204},
        {"Iota", 921},
        {"Iuml", 207},
        {"Kappa", 922},
        {"Lambda", 923},
        {"Mu", 924},
        {"Ntilde", 209},
        {"Nu", 925},
        {"OElig", 338},
        {"Oacute", 211},
        {"Ocirc", 212},
        {"Ograve", 210},
        {"Omega", 937},
        {"Omicron", 927},
        {"Oslash", 216},
        {"Otilde", 213},
        {"Ouml", 214},
        {"Phi", 934},
        {"Pi", 928},
        {"Prime", 8243},
        {"Psi", 936},
        {"Rho", 929},
        {"Scaron", 352},
        {"Sigma", 931},
        {"THORN", 222},
        {"Tau", 932},
        {"Theta", 920},
        {"Uacute", 218},
        {"Ucirc", 219},
        {"Ugrave", 217},
        {"Upsilon", 933},
        {"Uuml", 220},
        {"Xi", 926},
        {"Yacute", 221},
        {"Yuml", 376},
        {"Zeta", 918},
        {"aacute", 225},
        {"acirc", 226},
        {"acute", 180},
        {"aelig", 230},
        {"agrave", 224},
        {"alefsym", 8501},
        {"alpha", 945},
        {"amp", 38},
        {"and", 8743},
        {"ang", 8736},
        {"aring", 229},
        {"asymp", 8776},
        {"atilde", 227},
        {"auml", 228},
        {"bdquo", 8222},
        {"beta", 946},
        {"brvbar", 166},
        {"bull", 8226},
        {"cap", 8745},
        {"ccedil", 231},
        {"cdots", 8943},
        {"cedil", 184},
        {"cent", 162},
        {"chi", 967},
        {"circ", 710},
        {"clubs", 9827},
        {"cong", 8773},
        {"copy", 169},
        {"crarr", 8629},
        {"cup", 8746},
        {"curren", 164},
        {"dArr", 8659},
        {"dagger", 8224},
        {"darr", 8595},
        {"ddots", 8945},
        {"deg", 176},
        {"delta", 948},
        {"diams", 9830},
        {"divide", 247},
        {"eacute", 233},
        {"ecirc", 234},
        {"egrave", 232},
        {"_emdash", 8212},      /* non-standard but commonly used */
        {"empty", 8709},
        {"emsp", 8195},
        {"_endash", 8211},      /* non-standard but commonly used */
        {"ensp", 8194},
        {"epsilon", 949},
        {"equiv", 8801},
        {"eta", 951},
        {"eth", 240},
        {"euml", 235},
        {"euro", 8364},
        {"exist", 8707},
        {"fnof", 402},
        {"forall", 8704},
        {"frac12", 189},
        {"frac14", 188},
        {"frac34", 190},
        {"frasl", 8260},
        {"gamma", 947},
        {"ge", 8805},
        {"gt", 62},
        {"hArr", 8660},
        {"harr", 8596},
        {"hearts", 9829},
        {"hellip", 8230},
        {"iacute", 237},
        {"icirc", 238},
        {"iexcl", 161},
        {"igrave", 236},
        {"image", 8465},
        {"infin", 8734},
        {"int", 8747},
        {"iota", 953},
        {"iquest", 191},
        {"isin", 8712},
        {"iuml", 239},
        {"kappa", 954},
        {"lArr", 8656},
        {"lambda", 955},
        {"lang", 9001},
        {"laquo", 171},
        {"larr", 8592},
        {"lceil", 8968},
        {"_ldots", 8230},
        {"ldquo", 8220},
        {"le", 8804},
        {"lfloor", 8970},
        {"lowast", 8727},
        {"loz", 9674},
        {"lrm", 8206},
        {"lsaquo", 8249},
        {"lsquo", 8216},
        {"lt", 60},
        {"macr", 175},
        {"mdash", 8212},
        {"micro", 181},
        {"middot", 183},
        {"minus", 8722},
        {"mu", 956},
        {"nabla", 8711},
        {"nbsp", 160},
        {"ndash", 8211},
        {"ne", 8800},
        {"ni", 8715},
        {"not", 172},
        {"notin", 8713},
        {"nsub", 8836},
        {"ntilde", 241},
        {"nu", 957},
        {"oacute", 243},
        {"ocirc", 244},
        {"oelig", 339},
        {"ograve", 242},
        {"oline", 8254},
        {"omega", 969},
        {"omicron", 959},
        {"oplus", 8853},
        {"or", 8744},
        {"ordf", 170},
        {"ordm", 186},
        {"oslash", 248},
        {"otilde", 245},
        {"otimes", 8855},
        {"ouml", 246},
        {"para", 182},
        {"part", 8706},
        {"permil", 8240},
        {"perp", 8869},
        {"phi", 966},
        {"pi", 960},
        {"piv", 982},
        {"plusmn", 177},
        {"pound", 163},
        {"prime", 8242},
        {"prod", 8719},
        {"prop", 8733},
        {"psi", 968},
        {"quad", 8193},
        {"quot", 34},
        {"rArr", 8658},
        {"radic", 8730},
        {"rang", 9002},
        {"raquo", 187},
        {"rarr", 8594},
        {"rceil", 8969},
        {"rdquo", 8221},
        {"real", 8476},
        {"reg", 174},
        {"rfloor", 8971},
        {"rho", 961},
        {"rlm", 8207},
        {"rsaquo", 8250},
        {"rsquo", 8217},
        {"sbquo", 8218},
        {"scaron", 353},
        {"sdot", 8901},
        {"sect", 167},
        {"shy", 173},
        {"sigma", 963},
        {"sigmaf", 962},
        {"sim", 8764},
        {"_sp", 8194},
        {"spades", 9824},
        {"sub", 8834},
        {"sube", 8838},
        {"sum", 8721},
        {"sup", 8835},
        {"sup1", 185},
        {"sup2", 178},
        {"sup3", 179},
        {"supe", 8839},
        {"szlig", 223},
        {"tau", 964},
        {"there4", 8756},
        {"theta", 952},
        {"thetasym", 977},
        {"thinsp", 8201},
        {"thorn", 254},
        {"tilde", 732},
        {"times", 215},
        {"trade", 8482},
        {"uArr", 8657},
        {"uacute", 250},
        {"uarr", 8593},
        {"ucirc", 251},
        {"ugrave", 249},
        {"uml", 168},
        {"upsih", 978},
        {"upsilon", 965},
        {"uuml", 252},
        {"_varepsilon", 8712},
        {"varphi", 981},
        {"_varpi", 982},
        {"varrho", 1009},
        {"vdots", 8942},
        {"_vsigma", 962},
        {"_vtheta", 977},
        {"weierp", 8472},
        {"xi", 958},
        {"yacute", 253},
        {"yen", 165},
        {"yuml", 255},
        {"zeta", 950},
        {"zwj", 8205},
        {"zwnj", 8204}
};

/*
 * Same number of entries as byname.  html_init fills this from
 * byname and sorts it by rune value; findbyrune searches it.
 */
static Hchar byrune[nelem(byname)];

static int
hnamecmp(const void *va, const void *vb)
{
        Hchar *a, *b;
        
        a = (Hchar*)va;
        b = (Hchar*)vb;
        return strcmp(a->s, b->s);
}

static int
hrunecmp(const void *va, const void *vb)
{
        Hchar *a, *b;
        
        a = (Hchar*)va;
        b = (Hchar*)vb;
        return a->r - b->r;
}

static void
html_init(void)
{
        static int init;
        int i;
        
        if(init)
                return;
        init = 1;
        memmove(byrune, byname, sizeof byrune);
        
        /* Eliminate names we aren't allowed to generate. */
        for(i=0; i<nelem(byrune); i++){
                if(byrune[i].s[0] == '_'){
                        byrune[i].r = Runeerror;
                        byname[i].s++;
                }
        }
        
        qsort(byname, nelem(byname), sizeof byname[0], hnamecmp);
        qsort(byrune, nelem(byrune), sizeof byrune[0], hrunecmp);
}

/*
 * Binary search byname for the entity called s.
 * Returns the entry's rune, or Runeerror when no entry matches.
 * byname must already have been sorted by name (html_init does this).
 */
static Rune
findbyname(char *s)
{
        int lo, hi, mid, cmp;

        /* the candidates are byname[lo..hi-1] */
        lo = 0;
        hi = nelem(byname);
        while(lo < hi){
                mid = lo + (hi-lo)/2;
                cmp = strcmp(byname[mid].s, s);
                if(cmp < 0)
                        lo = mid+1;
                else if(cmp > 0)
                        hi = mid;
                else
                        return byname[mid].r;
        }
        return Runeerror;
}

/*
 * Binary search byrune for rune r.
 * Returns the entry's name, or nil when r is Runeerror or has no entry.
 * byrune must already have been sorted by rune (html_init does this).
 */
static char*
findbyrune(Rune r)
{
        int lo, hi, mid;

        /* Runeerror is the rune html_init gives to entries that must not be generated */
        if(r == Runeerror)
                return nil;

        /* the candidates are byrune[lo..hi-1] */
        lo = 0;
        hi = nelem(byrune);
        while(lo < hi){
                mid = lo + (hi-lo)/2;
                if(byrune[mid].r == r)
                        return byrune[mid].s;
                if(byrune[mid].r < r)
                        lo = mid+1;
                else
                        hi = mid;
        }
        return nil;
}

/*
 * Read HTML text from fd, replace character references by the runes
 * they denote, and hand the resulting runes to out through OUT.
 *
 * After an '&', up to nelem(buf)-2 further bytes are collected until
 * ';', blank, tab, CR, NL or end of input.  If the text ends in ';'
 * it is tried as a named reference (findbyname) and then as a numeric
 * reference: "&#ddd;" in decimal, or "&#xhhh;" / "&#Xhhh;" in hex.
 * A numeric reference must have at least one digit, no sign, and a
 * value in [0, NRUNE).  Anything that does not qualify is passed on
 * as the text that was read, converted with chartorune.
 *
 * x is unused.
 */
void
html_in(int fd, long *x, struct convert *out)
{
        char buf[100], *p, *digits;
        Biobuf b;
        Rune rbuf[N];
        Rune *r, *er;
        int c, i, base;
        long v;
        
        USED(x);
        
        html_init();
        r = rbuf;
        er = rbuf+N;
        Binit(&b, fd, OREAD);
        while((c = Bgetrune(&b)) != Beof){
                /* flush first, so there is always room for the next rune */
                if(r >= er){
                        OUT(out, rbuf, r-rbuf);
                        r = rbuf;
                }
                if(c == '&'){
                        /* gather the candidate reference, byte by byte */
                        buf[0] = c;
                        for(i=1; i<nelem(buf)-1;){
                                c = Bgetc(&b);
                                if(c == Beof)
                                        break;
                                buf[i++] = c;
                                if(strchr("; \t\r\n", c))
                                        break;
                        }
                        buf[i] = 0;
                        if(buf[i-1] == ';'){
                                /* temporarily cut off the ';' for the name lookup */
                                buf[i-1] = 0;
                                if((c = findbyname(buf+1)) != Runeerror){
                                        *r++ = c;
                                        continue;
                                }
                                buf[i-1] = ';';
                                if(buf[1] == '#'){
                                        if(buf[2] == 'x' || buf[2] == 'X'){
                                                digits = buf+3;
                                                base = 16;
                                        }else{
                                                digits = buf+2;
                                                base = 10;
                                        }
                                        /* strtol would accept a sign; a reference may not have one */
                                        if(*digits == '+' || *digits == '-')
                                                goto bad;
                                        /*
                                         * Keep the result in a long so an oversized
                                         * value cannot be truncated into range, and
                                         * require that at least one digit was parsed
                                         * so "&#;" is not turned into rune 0.
                                         */
                                        v = strtol(digits, &p, base);
                                        if(p == digits || *p != ';' || v < 0 || v >= NRUNE)
                                                goto bad;
                                        *r++ = v;
                                        continue;
                                }
                        }
                bad:
                        /* not a reference we understand: emit the collected text itself */
                        for(p=buf; p<buf+i; ){
                                p += chartorune(r++, p);
                                if(r >= er){
                                        OUT(out, rbuf, r-rbuf);
                                        r = rbuf;
                                }
                        }
                        continue;
                }
                *r++ = c;
        }
        if(r > rbuf)
                OUT(out, rbuf, r-rbuf);
        /* NOTE(review): zero-length call presumably marks end of input - confirm against OUT */
        OUT(out, rbuf, 0);
}

/*
 * Write the n runes at r to file descriptor 1 as HTML text.
 * Runes below Runeself are written as they are; any other rune is
 * written as "&name;" when findbyrune knows a name for it and as a
 * decimal "&#ddd;" reference otherwise.  x is unused.
 *
 * Output goes through a Biobuf because a single rune can take
 * more than UTFmax bytes once it is spelled as a reference.
 */
void
html_out(Rune *r, int n, long *x)
{
        Biobuf bout;
        char *name;
        int i;

        USED(x);
        html_init();
        Binit(&bout, 1, OWRITE);
        for(i=0; i<n; i++){
                if(r[i] < Runeself){
                        Bputrune(&bout, r[i]);
                        continue;
                }
                name = findbyrune(r[i]);
                if(name == nil)
                        Bprint(&bout, "&#%d;", r[i]);
                else
                        Bprint(&bout, "&%s;", name);
        }
        Bflush(&bout);
}