Blame | Last modification | View Log | RSS feed
/*
* Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002.
*
* Sccsid @(#)bracket.c 1.14 (gritter) 10/18/03
*/
/* UNIX(R) Regular Expresssion Library
*
* Note: Code is released under the GNU LGPL
*
* Copyright (C) 2001 Caldera International, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to:
* Free Software Foundation, Inc.
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* #include "synonyms.h" */
#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include "re.h"
/*
* Build and match the [...] part of REs.
*
* In general, each compiled bracket construct holds a set of mapped
* wide character values and a set of character classifications.
* The mapping applied (when the current LC_COLLATE is not CHF_ENCODED)
* is the "basic" weight (cep->weight[0]); otherwise the actual wide
* character is used.
*
* To support simplified range handling, this code assumes that a w_type,
* a signed integer type, can hold all valid basic weight values (as well
* as all wide character values for CHF_ENCODED locales) and that these
* are all positive. Negative values indicate error conditions (BKT_*);
* zero (which must be the same as WGHT_IGNORE) indicates success, but
* that the item installed is not a range endpoint.
*/
static int
addwide(Bracket *bp, wchar_t ord)
{
unsigned int nw;
if ((nw = bp->nwide) < NWIDE)
bp->wide[nw] = ord;
else
{
if (nw % NWIDE == 0 && (bp->exwide =
realloc(bp->exwide, nw * sizeof(wchar_t))) == 0)
{
return BKT_ESPACE;
}
nw -= NWIDE;
bp->exwide[nw] = ord;
}
bp->nwide++;
return 0;
}
#if USHRT_MAX == 65535 /* have 16 bits */
#define PLIND(n) ((n) >> 4)
#define PLBIT(n) (1 << ((n) & 0xf))
#else
#define PLIND(n) ((n) / CHAR_BIT)
#define PLBIT(n) (1 << ((n) % CHAR_BIT))
#endif
#define RANGE ((wchar_t)'-') /* separates wide chars in ranges */
static int
addrange(Bracket *bp, wchar_t ord, w_type prev)
{
int ret;
if (prev > 0 && prev != ord) /* try for range */
{
if (prev > ord)
{
if (bp->flags & BKT_ODDRANGE) /* prev only - done */
return 0;
else if ((bp->flags & BKT_BADRANGE) == 0)
return BKT_ERANGE;
}
else
{
if (++prev <= UCHAR_MAX) /* "prev" already there */
{
do
{
bp->byte[PLIND(prev)] |= PLBIT(prev);
if (prev == ord)
return 0;
} while (++prev <= UCHAR_MAX);
}
if ((ret = addwide(bp, prev)) != 0)
return ret;
if (++prev > ord)
return 0;
if (prev < ord && (ret = addwide(bp, RANGE)) != 0)
return ret;
return addwide(bp, ord);
}
}
if (ord <= UCHAR_MAX)
{
bp->byte[PLIND(ord)] |= PLBIT(ord);
return 0;
}
if (prev == ord) /* don't bother */
return 0;
return addwide(bp, ord);
}
static w_type
place(Bracket *bp, wchar_t wc, w_type prev, int mb_cur_max)
{
const CollElem *cep;
CollElem spare;
int ret;
if ((cep = libuxre_collelem(bp->col, &spare, wc)) != ELEM_ENCODED)
{
if (cep == ELEM_BADCHAR)
return BKT_BADCHAR;
wc = cep->weight[0];
}
if ((ret = addrange(bp, wc, prev)) != 0)
return ret;
return wc;
}
#ifndef CHARCLASS_NAME_MAX
# define CHARCLASS_NAME_MAX 127
#endif
static w_type
chcls(Bracket *bp, const unsigned char *s, int n)
{
char clsstr[CHARCLASS_NAME_MAX + 1];
unsigned int nt;
wctype_t wct;
if (n > CHARCLASS_NAME_MAX)
return BKT_ECTYPE;
(void)memcpy(clsstr, s, n);
clsstr[n] = '\0';
if ((wct = wctype(clsstr)) == 0)
return BKT_ECTYPE;
if ((nt = bp->ntype) < NTYPE)
bp->type[nt] = wct;
else
{
if (nt % NTYPE == 0 && (bp->extype =
realloc(bp->extype, nt * sizeof(wctype_t))) == 0)
{
return BKT_ESPACE;
}
nt -= NTYPE;
bp->extype[nt] = wct;
}
bp->ntype++;
return 0; /* cannot be end point of a range */
}
/*
* The purpose of mcce() and its Mcce structure is to locate
* the next full collation element from "wc" and "s". It is
* called both at compile and execute time. These two differ
* primarily in that at compile time there is an exact number
* of bytes to be consumed, while at execute time the longest
* valid collation element is to be found.
*
* When BKT_ONECASE is set, MCCEs become particularly messy.
* There is no guarantee that all possible combinations of
* upper/lower case are defined as MCCEs. Thus, this code
* tries both lower- and uppercase (in that order) for each
* character than might be part of an MCCE.
*/
typedef struct
{
const unsigned char *max; /* restriction by caller */
const unsigned char *aft; /* longest successful */
Bracket *bp; /* readonly */
struct lc_collate *col; /* readonly */
const CollElem *cep; /* entry matching longest */
wchar_t ch; /* initial character (if any) */
w_type wc; /* character matching "aft" */
} Mcce;
static int
mcce(Mcce *mcp, const CollElem *cep, const unsigned char *s, int mb_cur_max,
int compile_time)
{
const CollElem *nxt;
CollElem spare;
w_type ch, wc;
int i;
/*
* Get next character.
*/
if ((wc = mcp->ch) != '\0')
{
mcp->ch = '\0';
}
else if (ISONEBYTE(wc = *s++))
{
if (wc == '\0')
return 0;
}
else if ((i = libuxre_mb2wc(&wc, s)) > 0)
{
s += i;
if (mcp->max != 0 && s > mcp->max)
return 0;
}
else if (i < 0)
return BKT_ILLSEQ;
/*
* Try out the this character as part of an MCCE.
* If BKT_ONECASE is set, this code tries both the lower- and
* uppercase version, continuing if it matches so far.
*/
ch = wc;
if (mcp->bp->flags & BKT_ONECASE)
{
if ((wc = to_lower(wc)) == ch)
ch = to_upper(wc);
}
for (;;) /* at most twice */
{
if (cep == ELEM_BADCHAR) /* first character */
{
if ((nxt = libuxre_collelem(mcp->col, &spare, wc))
== ELEM_ENCODED
|| (mcp->col->flags & CHF_MULTICH) == 0
|| s == mcp->max)
{
mcp->aft = s;
mcp->cep = nxt;
mcp->wc = wc;
break;
}
}
else
{
nxt = libuxre_collmult(mcp->col, cep, wc);
}
if (nxt != ELEM_BADCHAR)
{
/*
* Okay so far. Record this collating element
* if it's really one (not WGHT_IGNORE) and
* we've reached a new high point or it's the
* first match.
*
* If there's a possibility for more, call mcce()
* recursively for the subsequent characters.
*/
if (nxt->weight[0] != WGHT_IGNORE
&& (mcp->aft < s || mcp->cep == ELEM_BADCHAR))
{
mcp->aft = s;
mcp->cep = nxt;
mcp->wc = wc;
}
if (nxt->multbeg != 0
&& (mcp->max == 0 || s < mcp->max))
{
if ((i = mcce(mcp, nxt, s, mb_cur_max,
compile_time)) != 0)
return i;
}
}
if (wc == ch)
break;
wc = ch;
}
return 0;
}
static w_type
eqcls(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max)
{
w_type last;
Mcce mcbuf;
int err;
mcbuf.max = &s[n];
mcbuf.aft = &s[0];
mcbuf.bp = bp;
mcbuf.col = bp->col;
mcbuf.cep = ELEM_BADCHAR;
mcbuf.ch = '\0';
if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0)
return err;
if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max)
return BKT_EEQUIV;
last = mcbuf.wc;
if (mcbuf.cep != ELEM_ENCODED && mcbuf.col->nweight > 1)
{
const CollElem *cep;
/*
* The first and last weight[0] values for equivalence
* classes are stuffed into the terminator for the
* multiple character lists. If these values are
* scattered (elements that are not part of this
* equivalence class have weight[0] values between the
* two end points), then SUBN_SPECIAL is placed in
* this terminator. Note that weight[1] of the
* terminator must be other than WGHT_IGNORE, too.
*/
last = mcbuf.cep->weight[0];
if ((cep = libuxre_collmult(bp->col, mcbuf.cep, 0))
!= ELEM_BADCHAR
&& cep->weight[1] != WGHT_IGNORE)
{
last = cep->weight[1];
if (cep->subnbeg == SUBN_SPECIAL)
{
unsigned int nq;
/*
* Permit ranges up to the first and
* after the last.
*/
if (prev > 0 && prev != cep->weight[0]
&& (prev = addrange(bp,
cep->weight[0], prev)) != 0)
{
return prev;
}
/*
* Record the equivalence class by storing
* the primary weight.
*/
if ((nq = bp->nquiv) < NQUIV)
bp->quiv[nq] = mcbuf.cep->weight[1];
else
{
if (nq % NQUIV == 0 && (bp->exquiv =
realloc(bp->exquiv,
nq * sizeof(wuchar_type)))
== 0)
{
return REG_ESPACE;
}
nq -= NQUIV;
bp->exquiv[nq] = mcbuf.cep->weight[1];
}
bp->nquiv++;
return last;
}
mcbuf.cep = cep;
}
mcbuf.wc = mcbuf.cep->weight[0];
}
/*
* Determine range, if any, to install.
*
* If there's a pending low (prev > 0), then try to use it.
*
* Otherwise, try to use mcbuf.wc as the low end of the range.
* Since addrange() assumes that the low point has already been
* placed, we try to fool it by using a prev of one less than
* mcbuf.wc. But, if that value would not look like a valid
* low point of a range, we have to explicitly place mcbuf.wc.
*/
if (prev <= 0 && (prev = mcbuf.wc - 1) <= 0)
{
if ((prev = addrange(bp, mcbuf.wc, 0)) != 0)
return prev;
}
if ((mcbuf.wc = addrange(bp, last, prev)) != 0)
return mcbuf.wc;
return last;
}
static w_type
clsym(Bracket *bp, const unsigned char *s, int n, w_type prev, int mb_cur_max)
{
Mcce mcbuf;
int err;
mcbuf.max = &s[n];
mcbuf.aft = &s[0];
mcbuf.bp = bp;
mcbuf.col = bp->col;
mcbuf.cep = ELEM_BADCHAR;
mcbuf.ch = '\0';
if ((err = mcce(&mcbuf, ELEM_BADCHAR, s, mb_cur_max, 1)) != 0)
return err;
if (mcbuf.cep == ELEM_BADCHAR || mcbuf.aft != mcbuf.max)
return BKT_ECOLLATE;
if (mcbuf.cep != ELEM_ENCODED)
mcbuf.wc = mcbuf.cep->weight[0];
if ((err = addrange(bp, mcbuf.wc, prev)) != 0)
return err;
return mcbuf.wc;
}
/*
* Scans the rest of a bracket construction within a regular
* expression and fills in a description for it.
* The leading [ and the optional set complement indicator
* were handled already by the caller.
* Returns:
* <0 error (a BKT_* value)
* >0 success; equals how many bytes were scanned.
*/
LIBUXRE_STATIC int
libuxre_bktmbcomp(Bracket *bp, const unsigned char *pat0,
int flags, int mb_cur_max)
{
static const Bracket zero = {0};
const unsigned char *pat = pat0;
struct lc_collate *savecol;
w_type n, wc, prev = 0;
/*
* Set represented set to empty. Easiest to copy an empty
* version over the caller's, (re)setting col and flags.
*/
savecol = bp->col;
*bp = zero;
bp->col = savecol;
bp->flags = flags
& (BKT_NEGATED | BKT_ONECASE | BKT_NOTNL | BKT_BADRANGE |
BKT_ODDRANGE);
/*
* Handle optional "empty" brackets; typically only used
* in combination with BKT_QUOTE or BKT_ESCAPE.
*/
if ((wc = *pat) == ']' && (flags & BKT_EMPTY) != 0)
return 1;
/*
* Populate *bp.
*/
for (;; prev = n)
{
switch (wc)
{
case '\0':
ebrack:;
n = BKT_EBRACK;
goto err;
case '\n':
if (flags & BKT_NLBAD)
goto ebrack;
goto regular;
case '/':
if (flags & BKT_SLASHBAD)
goto ebrack;
goto regular;
case '\\':
if ((flags & (BKT_ESCAPE | BKT_QUOTE
| BKT_ESCNL | BKT_ESCSEQ)) == 0)
{
goto regular;
}
switch (wc = *++pat)
{
default:
noesc:;
if ((flags & BKT_ESCAPE) == 0)
{
wc = '\\';
pat--;
}
break;
case '\\':
case ']':
case '-':
case '^':
if ((flags & BKT_QUOTE) == 0)
goto noesc;
break;
case 'a':
if ((flags & BKT_ESCSEQ) == 0 ||
(flags & BKT_OLDESC))
goto noesc;
wc = '\a';
break;
case 'b':
if ((flags & BKT_ESCSEQ) == 0)
goto noesc;
wc = '\b';
break;
case 'f':
if ((flags & BKT_ESCSEQ) == 0)
goto noesc;
wc = '\f';
break;
case 'n':
if ((flags & (BKT_ESCSEQ | BKT_ESCNL)) == 0)
goto noesc;
wc = '\n';
break;
case 'r':
if ((flags & BKT_ESCSEQ) == 0)
goto noesc;
wc = '\r';
break;
case 't':
if ((flags & BKT_ESCSEQ) == 0)
goto noesc;
wc = '\t';
break;
case 'v':
if ((flags & BKT_ESCSEQ) == 0 ||
(flags & BKT_OLDESC))
goto noesc;
wc = '\v';
break;
case 'x':
if ((flags & BKT_ESCSEQ) == 0 ||
(flags & BKT_OLDESC))
goto noesc;
if (!isxdigit(wc = *++pat))
{
pat--;
goto noesc;
}
/*
* Take as many hex digits as possible,
* ignoring overflows.
* Any positive result is okay.
*/
n = 0;
do
{
if (isdigit(wc))
wc -= '0';
else if (isupper(wc))
wc -= 'A' + 10;
else
wc -= 'a' + 10;
n <<= 4;
n |= wc;
} while (isxdigit(wc = *++pat));
pat--;
if ((wc = n) <= 0)
{
n = BKT_BADESC;
goto err;
}
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
if ((flags & BKT_ESCSEQ) == 0 ||
(flags & BKT_OLDESC))
goto noesc;
/*
* For compatibility (w/awk),
* permit "octal" 8 and 9.
*/
n = wc - '0';
if ((wc = *++pat) >= '0' && wc <= '9')
{
n <<= 3;
n += wc - '0';
if ((wc = *++pat) >= '0' && wc <= '9')
{
n <<= 3;
n += wc - '0';
}
}
pat--;
if ((wc = n) <= 0)
{
n = BKT_BADESC;
goto err;
}
break;
}
goto regular;
case '[':
if (((wc = *++pat) == ':' || wc == '=' || wc == '.') &&
(flags & BKT_NOI18N) == 0)
{
n = 0;
while (*++pat != wc || pat[1] != ']')
{
if (*pat == '\0')
{
badpat:;
n = BKT_BADPAT;
goto err;
}
else if (*pat == '/')
{
if (flags & BKT_SLASHBAD)
goto badpat;
}
else if (*pat == '\n')
{
if (flags & BKT_NLBAD)
goto badpat;
}
n++;
}
if (n == 0)
{
n = BKT_EMPTYSUBBKT;
goto err;
}
if (wc == ':')
n = chcls(bp, &pat[-n], n);
else if (wc == '=')
n = eqcls(bp, &pat[-n], n, prev,
mb_cur_max);
else /* wc == '.' */
n = clsym(bp, &pat[-n], n, prev,
mb_cur_max);
pat++;
break;
}
wc = '[';
pat--;
goto regular;
default:
if (!ISONEBYTE(wc) &&
(n = libuxre_mb2wc(&wc, pat + 1)) > 0)
pat += n;
regular:;
n = place(bp, wc, prev, mb_cur_max);
break;
}
if (n < 0) {
n = BKT_ILLSEQ;
goto err;
}
if ((wc = *++pat) == ']')
break;
if (wc == '-' && n != 0)
{
if (prev == 0 || (flags & BKT_SEPRANGE) == 0)
{
if ((wc = *++pat) != ']')
continue; /* valid range */
wc = '-';
pat--;
}
}
n = 0; /* no range this time */
}
return pat - pat0 + 1;
err:;
libuxre_bktfree(bp);
return n;
}
LIBUXRE_STATIC void
libuxre_bktfree(Bracket *bp)
{
if (bp->extype != 0)
free(bp->extype);
if (bp->exquiv != 0)
free(bp->exquiv);
if (bp->exwide != 0)
free(bp->exwide);
}
LIBUXRE_STATIC int
libuxre_bktmbexec(Bracket *bp, wchar_t wc,
const unsigned char *str, int mb_cur_max)
{
unsigned int i;
wchar_t lc, uc;
Mcce mcbuf;
mcbuf.aft = str; /* in case of match in character classes */
mcbuf.ch = wc;
/*
* First: check the single wc against any character classes.
* Since multiple character collating elements are not part
* of this world, they don't apply here.
*/
if ((i = bp->ntype) != 0)
{
wctype_t *wctp = &bp->type[0];
if (bp->flags & BKT_ONECASE)
{
if ((wc = to_lower(wc)) == mcbuf.ch)
mcbuf.ch = to_upper(wc);
}
for (;;)
{
if (iswctype(mb_cur_max==1?btowc(wc):wc, *wctp))
goto match;
if (wc != mcbuf.ch &&
iswctype(mb_cur_max==1?btowc(mcbuf.ch):mcbuf.ch,
*wctp))
goto match;
if (--i == 0)
break;
if (++wctp == &bp->type[NTYPE])
wctp = &bp->extype[0];
}
}
/*
* The main match is determined by the weight[0] value
* of the character (or characters, if the input can be
* taken as a multiple character collating element).
*/
mcbuf.max = 0;
mcbuf.bp = bp;
mcbuf.col = bp->col;
mcbuf.cep = ELEM_BADCHAR;
mcce(&mcbuf, ELEM_BADCHAR, str, mb_cur_max, 0);
if (mcbuf.cep == ELEM_BADCHAR)
return -1; /* never matches */
if (mcbuf.cep != ELEM_ENCODED)
mcbuf.wc = mcbuf.cep->weight[0];
/*
* POSIX.2 demands that both a character and its case counterpart
* can match if REG_ICASE is set. This means that [B-z] matches
* 'A', 'a', and '['.
*/
if (bp->flags & BKT_ONECASE)
{
lc = to_lower(mcbuf.wc);
uc = to_upper(mcbuf.wc);
}
else
lc = uc = mcbuf.wc;
/*
* See if it's in the set. Note that the list of true wide
* character values has explicit ranges.
*/
if (mcbuf.wc <= UCHAR_MAX)
{
if (bp->byte[PLIND(lc)] & PLBIT(lc))
goto match;
if (lc != uc && (bp->byte[PLIND(uc)] & PLBIT(uc)))
goto match;
}
else if ((i = bp->nwide) != 0)
{
wchar_t *wcp = &bp->wide[0];
long lcmp, ucmp;
for (;;)
{
if ((lcmp = lc - *wcp) == 0)
goto match;
ucmp = uc - *wcp;
if (lc != uc && ucmp == 0)
goto match;
if (--i == 0)
break;
if (++wcp == &bp->wide[NWIDE])
wcp = &bp->exwide[0];
if (*wcp == RANGE)
{
if (++wcp == &bp->wide[NWIDE])
wcp = &bp->exwide[0];
if (lcmp > 0 && lc <= *wcp)
goto match;
if (lc != uc && ucmp > 0 && uc < *wcp)
goto match;
if ((i -= 2) == 0)
break;
if (++wcp == &bp->wide[NWIDE])
wcp = &bp->exwide[0];
}
}
}
/*
* The last chance for a match is if an equivalence class
* was specified for which the primary weights are scattered
* through the weight[0]s.
*/
if ((i = bp->nquiv) != 0 && mcbuf.cep != ELEM_ENCODED)
{
wuchar_type *wucp = &bp->quiv[0];
mcbuf.wc = mcbuf.cep->weight[1];
for (;;)
{
if (mcbuf.wc == *wucp)
goto match;
if (--i == 0)
break;
if (++wucp == &bp->quiv[NQUIV])
wucp = &bp->exquiv[0];
}
}
/*
* Only here when no match against the set was found.
* One final special case w/r/t newline.
*/
if (bp->flags & BKT_NEGATED)
{
if (wc != '\n' || (bp->flags & BKT_NOTNL) == 0)
return mcbuf.aft - str;
}
return -1;
match:;
/*
* Only here when a match against the described set is found.
*/
if (bp->flags & BKT_NEGATED)
return -1;
return mcbuf.aft - str;
}