Subversion Repositories tendra.SVN

Rev

Rev 5 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 7u83 1
/*
 * Copyright (c) 2002-2005 The TenDRA Project <http://www.tendra.org/>.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * 3. Neither the name of The TenDRA Project nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 * $Id$
 */
/*
    		 Crown Copyright (c) 1997

    This TenDRA(r) Computer Program is subject to Copyright
    owned by the United Kingdom Secretary of State for Defence
    acting through the Defence Evaluation and Research Agency
    (DERA).  It is made available to Recipients with a
    royalty-free licence for its use, reproduction, transfer
    to other parties and amendment for any purpose not excluding
    product development provided that any such use et cetera
    shall be deemed to be acceptance of the following conditions:-

        (1) Its Recipients shall ensure that this Notice is
        reproduced upon any copies or amended versions of it;

        (2) Any amended version of it shall be clearly marked to
        show both the nature of and the organisation responsible
        for the relevant amendment or amendments;

        (3) Its onward transfer from a recipient to another
        party shall be deemed to be that party's acceptance of
        these conditions;

        (4) DERA gives no warranty or assurance as to its
        quality or suitability for any purpose and DERA accepts
        no liability whatsoever in relation to any use to which
        it may be put.
*/
59
 
60
 
61
#include "config.h"
62
#include <limits.h>
63
#include "c_types.h"
64
#include "char.h"
65
#include "literal.h"
66
 
67
 
68
/*
    ASCII FLAG

    This flag records whether the native codeset is ASCII.  It takes one
    of three values:

	-1	the conversion tables have not been initialised yet (the
		initial state);
	 1	every entry of the ASCII-to-native table maps to itself;
	 0	at least one entry maps to a different code.

    The flag is moved out of the -1 state by the first call to map_ascii.
*/

int is_ascii = -1;
2 7u83 76
 
77
 
78
/*
79
    CONVERSION TABLES
80
 
81
    These tables give the conversions of ASCII to and from the native
82
    codeset.  The ASCII to native table is deduced for the portable
83
    codeset from the list of characters in char.h which is given in
84
    ASCII sequence.
85
*/
86
 
87
 
6 7u83 88
static character to_ascii_tab[NO_CHAR + 1];
2 7u83 89
 
6 7u83 90
static character from_ascii_tab[NO_CHAR + 1] = {
2 7u83 91
#define NONE			char_illegal
6 7u83 92
#define CHAR_DATA(A, B, C, D)	(D),
2 7u83 93
#include "char.h"
94
#undef CHAR_DATA
6 7u83 95
	NONE			/* dummy last element */
96
};
2 7u83 97
 
98
 
99
/*
100
    CONVERT A CHARACTER TO ASCII
101
 
102
    This routine converts the character c from the native codeset to
103
    ASCII.
104
*/
105
 
6 7u83 106
unsigned long
107
to_ascii(unsigned long c, int *ch)
2 7u83 108
{
6 7u83 109
	if (c < NO_CHAR) {
110
		c = (unsigned long)to_ascii_tab[c];
111
		*ch = CHAR_ASCII;
112
	}
113
	return(c);
2 7u83 114
}
115
 
116
 
117
/*
118
    CONVERT A CHARACTER TO NATIVE CODESET
119
 
120
    This routine converts the character c to the native codeset from
121
    ASCII.
122
*/
123
 
6 7u83 124
unsigned long
125
from_ascii(unsigned long c, int *ch)
2 7u83 126
{
6 7u83 127
	if (c < NO_CHAR) {
128
		c = (unsigned long)from_ascii_tab[c];
129
		*ch = CHAR_SIMPLE;
130
	}
131
	return(c);
2 7u83 132
}
133
 
134
 
135
/*
136
    TRANSFORM A TABLE FROM ASCII TO NATIVE CODESET
137
 
138
    This routine maps the look-up table p from ASCII to native codeset.
139
*/
140
 
6 7u83 141
void
142
map_ascii(unsigned char *p)
2 7u83 143
{
6 7u83 144
	unsigned c;
145
	int asc = is_ascii;
146
	if (asc == -1) {
147
		/* Set up conversion tables */
148
		asc = 1;
149
		for (c = 0; c < NO_CHAR; c++) {
150
			to_ascii_tab[c] = (character)c;
151
		}
152
		for (c = 0; c < NO_CHAR; c++) {
153
			unsigned a = (unsigned)from_ascii_tab[c];
154
			if (a == NONE) {
155
				a = c;
156
			}
157
			if (a != c) {
158
				asc = 0;
159
			}
160
			from_ascii_tab[c] = (character)a;
161
			to_ascii_tab[a] = (character)c;
162
		}
163
		is_ascii = asc;
2 7u83 164
	}
165
 
6 7u83 166
	if (asc == 0) {
167
		/* Map table */
168
		unsigned char b = p[NONE];
169
		unsigned char copy[NO_CHAR];
170
		for (c = 0; c < NO_CHAR; c++) {
171
			copy[c] = p[c];
172
			p[c] = b;
173
		}
174
		SET(copy);
175
		for (c = 0; c < NO_CHAR; c++) {
176
			character a = from_ascii_tab[c];
177
			p[a] = copy[c];
178
		}
2 7u83 179
	}
6 7u83 180
	return;
2 7u83 181
}
182
 
183
 
184
/*
    TABLE OF ALPHABETIC UNICODE CHARACTERS

    This table gives the ranges of the unicode characters which are
    suitable for use in an identifier name in ascending order.  It is
    derived from Annex E of the standard (which derives from ISO/IEC
    PDTR 10176) with two misprints corrected.  0e0d should be 0e8d
    and 5e76 should be fe76.  Note that the upper and lower case
    English alphabets have been included even though they are not
    valid universal character names.

    Each entry is an inclusive range lo..hi.  The ranges must stay
    disjoint and sorted by lo, because unicode_alpha searches the table
    by binary chop.

    The constants are written without a UL suffix: every value is
    converted to unsigned long by the initialisation, so a single copy
    of the table serves whether or not the compiler accepts suffixes.
*/

static const struct {
	unsigned long lo;
	unsigned long hi;
} alpha_range[] = {
	/* Latin */
	{ 0x0041, 0x005a }, { 0x0061, 0x007a }, { 0x00c0, 0x00d6 },
	{ 0x00d8, 0x00f6 }, { 0x00f8, 0x01f5 }, { 0x01fa, 0x0217 },
	{ 0x0250, 0x02a8 },

	/* Greek */
	{ 0x0384, 0x0384 }, { 0x0388, 0x038a }, { 0x038c, 0x038c },
	{ 0x038e, 0x03a1 }, { 0x03a3, 0x03ce }, { 0x03d0, 0x03d6 },
	{ 0x03da, 0x03da }, { 0x03dc, 0x03dc }, { 0x03de, 0x03de },
	{ 0x03e0, 0x03e0 }, { 0x03e2, 0x03f3 },

	/* Cyrillic */
	{ 0x0401, 0x040d }, { 0x040f, 0x044f }, { 0x0451, 0x045c },
	{ 0x045e, 0x0481 }, { 0x0490, 0x04c4 }, { 0x04c7, 0x04c8 },
	{ 0x04cb, 0x04cc }, { 0x04d0, 0x04eb }, { 0x04ee, 0x04f5 },
	{ 0x04f8, 0x04f9 },

	/* Armenian */
	{ 0x0531, 0x0556 }, { 0x0561, 0x0587 },

	/* Hebrew */
	{ 0x05d0, 0x05ea }, { 0x05f0, 0x05f4 },

	/* Arabic */
	{ 0x0621, 0x063a }, { 0x0640, 0x0652 }, { 0x0670, 0x06b7 },
	{ 0x06ba, 0x06be }, { 0x06c0, 0x06ce }, { 0x06e5, 0x06e7 },

	/* Devanagari */
	{ 0x0905, 0x0939 }, { 0x0958, 0x0962 },

	/* Bengali */
	{ 0x0985, 0x098c }, { 0x098f, 0x0990 }, { 0x0993, 0x09a8 },
	{ 0x09aa, 0x09b0 }, { 0x09b2, 0x09b2 }, { 0x09b6, 0x09b9 },
	{ 0x09dc, 0x09dd }, { 0x09df, 0x09e1 }, { 0x09f0, 0x09f1 },

	/* Gurmukhi */
	{ 0x0a05, 0x0a0a }, { 0x0a0f, 0x0a10 }, { 0x0a13, 0x0a28 },
	{ 0x0a2a, 0x0a30 }, { 0x0a32, 0x0a33 }, { 0x0a35, 0x0a36 },
	{ 0x0a38, 0x0a39 }, { 0x0a59, 0x0a5c }, { 0x0a5e, 0x0a5e },

	/* Gujarati */
	{ 0x0a85, 0x0a8b }, { 0x0a8d, 0x0a8d }, { 0x0a8f, 0x0a91 },
	{ 0x0a93, 0x0aa8 }, { 0x0aaa, 0x0ab0 }, { 0x0ab2, 0x0ab3 },
	{ 0x0ab5, 0x0ab9 }, { 0x0ae0, 0x0ae0 },

	/* Oriya */
	{ 0x0b05, 0x0b0c }, { 0x0b0f, 0x0b10 }, { 0x0b13, 0x0b28 },
	{ 0x0b2a, 0x0b30 }, { 0x0b32, 0x0b33 }, { 0x0b36, 0x0b39 },
	{ 0x0b5c, 0x0b5d }, { 0x0b5f, 0x0b61 },

	/* Tamil */
	{ 0x0b85, 0x0b8a }, { 0x0b8e, 0x0b90 }, { 0x0b92, 0x0b95 },
	{ 0x0b99, 0x0b9a }, { 0x0b9c, 0x0b9c }, { 0x0b9e, 0x0b9f },
	{ 0x0ba3, 0x0ba4 }, { 0x0ba8, 0x0baa }, { 0x0bae, 0x0bb5 },
	{ 0x0bb7, 0x0bb9 },

	/* Telugu */
	{ 0x0c05, 0x0c0c }, { 0x0c0e, 0x0c10 }, { 0x0c12, 0x0c28 },
	{ 0x0c2a, 0x0c33 }, { 0x0c35, 0x0c39 }, { 0x0c60, 0x0c61 },

	/* Kannada */
	{ 0x0c85, 0x0c8c }, { 0x0c8e, 0x0c90 }, { 0x0c92, 0x0ca8 },
	{ 0x0caa, 0x0cb3 }, { 0x0cb5, 0x0cb9 }, { 0x0ce0, 0x0ce1 },

	/* Malayalam */
	{ 0x0d05, 0x0d0c }, { 0x0d0e, 0x0d10 }, { 0x0d12, 0x0d28 },
	{ 0x0d2a, 0x0d39 }, { 0x0d60, 0x0d61 },

	/* Thai */
	{ 0x0e01, 0x0e30 }, { 0x0e32, 0x0e33 }, { 0x0e40, 0x0e46 },
	{ 0x0e4f, 0x0e5b },

	/* Lao */
	{ 0x0e81, 0x0e82 }, { 0x0e84, 0x0e84 }, { 0x0e87, 0x0e87 },
	{ 0x0e88, 0x0e88 }, { 0x0e8a, 0x0e8a }, { 0x0e8d, 0x0e8d },
	{ 0x0e94, 0x0e97 }, { 0x0e99, 0x0e9f }, { 0x0ea1, 0x0ea3 },
	{ 0x0ea5, 0x0ea5 }, { 0x0ea7, 0x0ea7 }, { 0x0eaa, 0x0eaa },
	{ 0x0eab, 0x0eab }, { 0x0ead, 0x0eb0 }, { 0x0eb2, 0x0eb2 },
	{ 0x0eb3, 0x0eb3 }, { 0x0ebd, 0x0ebd }, { 0x0ec0, 0x0ec4 },
	{ 0x0ec6, 0x0ec6 },

	/* Georgian */
	{ 0x10a0, 0x10c5 }, { 0x10d0, 0x10f6 },

	/* Hangul */
	{ 0x1100, 0x1159 }, { 0x1161, 0x11a2 }, { 0x11a8, 0x11f9 },

	/* Latin (continued) */
	{ 0x1e00, 0x1e9a }, { 0x1ea0, 0x1ef9 },

	/* Greek (continued) */
	{ 0x1f00, 0x1f15 }, { 0x1f18, 0x1f1d }, { 0x1f20, 0x1f45 },
	{ 0x1f48, 0x1f4d }, { 0x1f50, 0x1f57 }, { 0x1f59, 0x1f59 },
	{ 0x1f5b, 0x1f5b }, { 0x1f5d, 0x1f5d }, { 0x1f5f, 0x1f7d },
	{ 0x1f80, 0x1fb4 }, { 0x1fb6, 0x1fbc }, { 0x1fc2, 0x1fc4 },
	{ 0x1fc6, 0x1fcc }, { 0x1fd0, 0x1fd3 }, { 0x1fd6, 0x1fdb },
	{ 0x1fe0, 0x1fec }, { 0x1ff2, 0x1ff4 }, { 0x1ff6, 0x1ffc },

	/* Hiragana */
	{ 0x3041, 0x3094 }, { 0x309b, 0x309e },

	/* Katakana */
	{ 0x30a1, 0x30fe },

	/* Bopomofo */
	{ 0x3105, 0x312c },

	/* CJK Unified Ideographs */
	{ 0x4e00, 0x9fa5 }, { 0xf900, 0xfa2d }, { 0xfb1f, 0xfb36 },
	{ 0xfb38, 0xfb3c }, { 0xfb3e, 0xfb3e }, { 0xfb40, 0xfb41 },
	{ 0xfb42, 0xfb44 }, { 0xfb46, 0xfbb1 }, { 0xfbd3, 0xfd3f },
	{ 0xfd50, 0xfd8f }, { 0xfd92, 0xfdc7 }, { 0xfdf0, 0xfdfb },
	{ 0xfe70, 0xfe72 }, { 0xfe74, 0xfe74 }, { 0xfe76, 0xfefc },
	{ 0xff21, 0xff3a }, { 0xff41, 0xff5a }, { 0xff66, 0xffbe },
	{ 0xffc2, 0xffc7 }, { 0xffca, 0xffcf }, { 0xffd2, 0xffd7 },
	{ 0xffda, 0xffdc }
};
2 7u83 328
 
329
 
330
/*
331
    DOES A UNICODE CHARACTER REPRESENT AN ALPHABETIC VALUE?
332
 
333
    This routine checks whether the unicode character c represents an
334
    alphabetic value suitable for use in an identifier name.  It
335
    operates by performing a binary chop on the table above.
336
*/
337
 
6 7u83 338
int
339
unicode_alpha(unsigned long c)
2 7u83 340
{
6 7u83 341
	int i = 0;
342
	int j = array_size(alpha_range) - 1;
343
	do {
344
		int k = (i + j) / 2;
345
		if (c < alpha_range[k].lo) {
346
			/* Lower half */
347
			j = k - 1;
348
		} else if (c > alpha_range[k].hi) {
349
			/* Upper half */
350
			i = k + 1;
351
		} else {
352
			/* Match found */
353
			return(1);
354
		}
355
	} while (i <= j);
356
	return(0);
2 7u83 357
}