Blame | Last modification | View Log | RSS feed
/*
* Changes by Gunnar Ritter, Freiburg i. Br., Germany, November 2002.
*
* Sccsid @(#)_collelem.c 1.4 (gritter) 10/18/03
*/
/* UNIX(R) Regular Expression Library
*
* Note: Code is released under the GNU LGPL
*
* Copyright (C) 2001 Caldera International, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to:
* Free Software Foundation, Inc.
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
/* #include "synonyms.h" */
#include "colldata.h"
#include <stddef.h>
/* Cast p to a pointer to a read-only CollElem (used on table addresses). */
#define CCE(p) ((const CollElem *)(p))
/* Cast p to a pointer to a read-only CollMult (used on table addresses). */
#define CCM(p) ((const CollMult *)(p))
/*
 * Find the collation element for the wide character wc in the
 * collation data described by col.
 *
 * Returns:
 *	ELEM_ENCODED	if col is null, has CHF_ENCODED set, or has no
 *			main table; collation is then based purely on
 *			the encoded value of the character.
 *	ELEM_BADCHAR	if no usable entry exists for wc.
 *	a pointer into the main table when wc has its own entry.
 *	spare		(filled in here) when wc lies inside a compressed
 *			range entry of the table.
 */
LIBUXRE_STATIC const CollElem *
libuxre_collelem(struct lc_collate *col, CollElem *spare, wchar_t wc)
{
	const char *base;
	const CollMult *ent;
	const CollElem *elem;
	wuchar_type uwc;
	size_t low, high, mid, k;
	long delta;
	int stride;

	if (col == 0)
		return ELEM_ENCODED;
	if (col->flags & CHF_ENCODED)
		return ELEM_ENCODED;
	base = (const char *)col->maintbl;
	if (base == 0)
		return ELEM_ENCODED;

	/*
	 * Direct lookup: the first 1+UCHAR_MAX entries are always
	 * addressed by character value, and so is the whole table
	 * when CHF_INDEXED is set (bounded by nmain).
	 */
	uwc = (wuchar_type)wc;
	if (uwc <= UCHAR_MAX
	    || ((col->flags & CHF_INDEXED) && uwc < col->nmain))
	{
		elem = CCE(&base[uwc * col->elemsize]);
		if (elem->weight[0] == WGHT_SPECIAL)
			return ELEM_BADCHAR;
		return elem;
	}
	if (col->flags & CHF_INDEXED)
		return ELEM_BADCHAR;	/* beyond the indexed table */

	/*
	 * Otherwise the entries following the first 1+UCHAR_MAX elements
	 * form a table of CollMult's searched by bisection on the
	 * half-open window [low, high).
	 *
	 * An entry whose subnbeg has SUBN_SPECIAL set stands for a run
	 * of similar elements starting at its ch; the remaining bits of
	 * subnbeg give the extent of the run.
	 */
	stride = col->elemsize + (sizeof(CollMult) - sizeof(CollElem));
	base += (1 + UCHAR_MAX) * col->elemsize;
	low = 0;
	high = col->nmain - UCHAR_MAX;
	while (low < high)
	{
		mid = (high + low) >> 1;
		if (mid < low)	/* high+low wrapped around: */
			mid |= ~(~(size_t)0 >> 1);	/* restore the top bit */
		ent = CCM(&base[mid * stride]);
		delta = wc - ent->ch;
		if (delta < 0)
		{
			high = mid;
			continue;
		}
		if (!(ent->elem.subnbeg & SUBN_SPECIAL))
		{
			/* ordinary entry: only an exact match counts */
			if (delta == 0)
				return &ent->elem;
			low = mid + 1;
			continue;
		}
		if (delta > (long)(ent->elem.subnbeg & ~SUBN_SPECIAL))
		{
			/* wc lies past the end of this run */
			low = mid + 1;
			continue;
		}
		/*
		 * wc is inside the run: synthesize its element in spare.
		 * The primary weight is offset by the position within the
		 * run; any further weight marked WGHT_SPECIAL takes the
		 * value of that computed primary weight.
		 */
		spare->multbeg = ent->elem.multbeg;
		spare->subnbeg = 0;
		spare->weight[0] = ent->elem.weight[0] + delta;
		for (k = 1; k < col->nweight; k++)
		{
			wuchar_type w = ent->elem.weight[k];

			if (w == WGHT_SPECIAL)
				w = spare->weight[0];
			spare->weight[k] = w;
		}
		return spare;
	}
	return ELEM_BADCHAR;
}