Subversion Repositories planix.SVN

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#ifdef PLAN9
2
#include	<u.h>
3
#include	<libc.h>
4
#include	<bio.h>
5
#else
6
#include	<sys/types.h>
7
#include	<stdio.h>
8
#include	<stdlib.h>
9
#include	<string.h>
10
#include	<unistd.h>
11
#include	<errno.h>
12
#include	"plan9.h"
13
#endif
14
#include	"hdr.h"
15
 
16
/*
17
	the our_* routines are implementations for the corresponding library
18
	routines. for a while, i tried to actually name them wctomb etc
19
	but stopped that after i found a system which made wchar_t an
20
	unsigned char.
21
*/
22
 
23
int our_wctomb(char *s, unsigned long wc);
24
int our_mbtowc(unsigned long *p, char *s, unsigned n);
25
int runetoisoutf(char *str, Rune *rune);
26
int fullisorune(char *str, int n);
27
int isochartorune(Rune *rune, char *str);
28
 
29
void
30
utf_in(int fd, long *notused, struct convert *out)
31
{
32
	char buf[N];
33
	int i, j, c, n, tot;
34
	ulong l;
35
 
36
	USED(notused);
37
	tot = 0;
38
	while((n = read(fd, buf+tot, N-tot)) >= 0){
39
		tot += n;
40
		for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
41
			c = our_mbtowc(&l, buf+i, tot-i);
42
			if(c == -1){
43
				if(squawk)
44
					EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
45
				if(clean){
46
					i++;
47
					continue;
48
				}
49
				nerrors++;
50
				l = Runeerror;
51
				c = 1;
52
			}
53
			runes[j++] = l;
54
			i += c;
55
		}
56
		OUT(out, runes, j);
57
		tot -= i;
58
		ninput += i;
59
		if(tot)
60
			memmove(buf, buf+i, tot);
61
		if(n == 0)
62
			break;
63
	}
64
	OUT(out, runes, 0);
65
}
66
 
67
void
68
utf_out(Rune *base, int n, long *notused)
69
{
70
	char *p;
71
	Rune *r;
72
 
73
	USED(notused);
74
	nrunes += n;
75
	for(r = base, p = obuf; n-- > 0; r++){
76
		p += our_wctomb(p, *r);
77
	}
78
	noutput += p-obuf;
79
	write(1, obuf, p-obuf);
80
}
81
 
82
void
83
isoutf_in(int fd, long *notused, struct convert *out)
84
{
85
	char buf[N];
86
	int i, j, c, n, tot;
87
 
88
	USED(notused);
89
	tot = 0;
90
	while((n = read(fd, buf+tot, N-tot)) >= 0){
91
		tot += n;
92
		for(i=j=0; i<tot; ){
93
			if(!fullisorune(buf+i, tot-i))
94
				break;
95
			c = isochartorune(&runes[j], buf+i);
96
			if(runes[j] == Runeerror && c == 1){
97
				if(squawk)
98
					EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
99
				if(clean){
100
					i++;
101
					continue;
102
				}
103
				nerrors++;
104
			}
105
			j++;
106
			i += c;
107
		}
108
		OUT(out, runes, j);
109
		tot -= i;
110
		ninput += i;
111
		if(tot)
112
			memmove(buf, buf+i, tot);
113
		if(n == 0)
114
			break;
115
	}
116
	OUT(out, runes, 0);
117
}
118
 
119
void
120
isoutf_out(Rune *base, int n, long *notused)
121
{
122
	char *p;
123
	Rune *r;
124
 
125
	USED(notused);
126
	nrunes += n;
127
	for(r = base, p = obuf; n-- > 0; r++)
128
		p += runetoisoutf(p, r);
129
	noutput += p-obuf;
130
	write(1, obuf, p-obuf);
131
}
132
 
133
 
134
enum
135
{
136
	Char1	= Runeself,	Rune1	= Runeself,
137
	Char21	= 0xA1,		Rune21	= 0x0100,
138
	Char22	= 0xF6,		Rune22	= 0x4016,
139
	Char3	= 0xFC,		Rune3	= 0x10000,	/* really 0x38E2E */
140
	Esc	= 0xBE,		Bad	= Runeerror
141
};
142
 
143
/*
 * Digit tables filled in by mktable:
 *	U[byte] is the digit value of a trailing byte (isochartorune),
 *	T[digit] is the trailing byte for a digit value (runetoisoutf).
 * Both start zeroed; their users treat U[0] == 0 / T[0] == 0 as
 * "tables not built yet".
 */
static	uchar	U[256], T[256];
145
 
146
static
147
void
148
mktable(void)
149
{
150
	int i, u;
151
 
152
	for(i=0; i<256; i++) {
153
		u = i + (0x5E - 0xA0);
154
		if(i < 0xA0)
155
			u = i + (0xDF - 0x7F);
156
		if(i < 0x7F)
157
			u = i + (0x00 - 0x21);
158
		if(i < 0x21)
159
			u = i + (0xBE - 0x00);
160
		U[i] = u;
161
		T[u] = i;
162
	}
163
}
164
 
165
int
166
isochartorune(Rune *rune, char *str)
167
{
168
	int c, c1, c2;
169
	long l;
170
 
171
	if(U[0] == 0)
172
		mktable();
173
 
174
	/*
175
	 * one character sequence
176
	 *	00000-0009F => 00-9F
177
	 */
178
	c = *(uchar*)str;
179
	if(c < Char1) {
180
		*rune = c;
181
		return 1;
182
	}
183
 
184
	/*
185
	 * two character sequence
186
	 *	000A0-000FF => A0; A0-FF
187
	 */
188
	c1 = *(uchar*)(str+1);
189
	if(c < Char21) {
190
		if(c1 >= Rune1 && c1 < Rune21) {
191
			*rune = c1;
192
			return 2;
193
		}
194
		goto bad;
195
	}
196
 
197
	/*
198
	 * two character sequence
199
	 *	00100-04015 => A1-F5; 21-7E/A0-FF
200
	 */
201
	c1 = U[c1];
202
	if(c1 >= Esc)
203
		goto bad;
204
	if(c < Char22) {
205
		*rune =  (c-Char21)*Esc + c1 + Rune21;
206
		return 2;
207
	}
208
 
209
	/*
210
	 * three character sequence
211
	 *	04016-38E2D => A6-FB; 21-7E/A0-FF
212
	 */
213
	c2 = U[*(uchar*)(str+2)];
214
	if(c2 >= Esc)
215
		goto bad;
216
	if(c < Char3) {
217
		l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
218
		if(l >= Rune3)
219
			goto bad;
220
		*rune = l;
221
		return 3;
222
	}
223
 
224
	/*
225
	 * bad decoding
226
	 */
227
bad:
228
	*rune = Bad;
229
	return 1;
230
}
231
 
232
int
233
runetoisoutf(char *str, Rune *rune)
234
{
235
	long c;
236
 
237
	if(T[0] == 0)
238
		mktable();
239
 
240
	/*
241
	 * one character sequence
242
	 *	00000-0009F => 00-9F
243
	 */
244
	c = *rune;
245
	if(c < Rune1) {
246
		str[0] = c;
247
		return 1;
248
	}
249
 
250
	/*
251
	 * two character sequence
252
	 *	000A0-000FF => A0; A0-FF
253
	 */
254
	if(c < Rune21) {
255
		str[0] = Char1;
256
		str[1] = c;
257
		return 2;
258
	}
259
 
260
	/*
261
	 * two character sequence
262
	 *	00100-04015 => A1-F5; 21-7E/A0-FF
263
	 */
264
	if(c < Rune22) {
265
		c -= Rune21;
266
		str[0] = c/Esc + Char21;
267
		str[1] = T[c%Esc];
268
		return 2;
269
	}
270
 
271
	/*
272
	 * three character sequence
273
	 *	04016-38E2D => A6-FB; 21-7E/A0-FF
274
	 */
275
	c -= Rune22;
276
	str[0] = c/(Esc*Esc) + Char22;
277
	str[1] = T[c/Esc%Esc];
278
	str[2] = T[c%Esc];
279
	return 3;
280
}
281
 
282
int
283
fullisorune(char *str, int n)
284
{
285
	int c;
286
 
287
	if(n > 0) {
288
		c = *(uchar*)str;
289
		if(c < Char1)
290
			return 1;
291
		if(n > 1)
292
			if(c < Char22 || n > 2)
293
				return 1;
294
	}
295
	return 0;
296
}
297
 
298
#if defined(PLAN9)
int	errno;		/* our_mbtowc reports bad input through errno */
#endif

/*
 * UTF-8 layout constants, grouped by sequence length:
 *	Tn	tag bits of the lead byte of an n-byte sequence
 *	Bitn	number of payload bits in that lead byte
 *	Maskn	mask selecting those payload bits
 *	Wcharn	largest value that fits in an n-byte sequence
 * Tx/Bitx/Maskx describe continuation bytes.
 *
 * NOTE(review): Bit6 is 2, so Mask6 keeps two bits of the lead byte
 * even though T6 (0xFC) leaves only the low two bits free; our_wctomb
 * and our_mbtowc both rely on this value - confirm it is intended.
 */
enum
{
	/* continuation bytes */
	Tx	= 0x80,
	Bitx	= 6,
	Maskx	= (1<<Bitx)-1,

	/* 1 byte */
	T1	= 0x00,
	Bit1	= 7,
	Mask1	= (1<<Bit1)-1,
	Wchar1	= (1UL<<Bit1)-1,

	/* 2 bytes */
	T2	= 0xC0,
	Bit2	= 5,
	Mask2	= (1<<Bit2)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,

	/* 3 bytes */
	T3	= 0xE0,
	Bit3	= 4,
	Mask3	= (1<<Bit3)-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,

	/* 4 bytes */
	T4	= 0xF0,
	Bit4	= 3,
	Mask4	= (1<<Bit4)-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,

	/* 5 bytes */
	T5	= 0xF8,
	Bit5	= 2,
	Mask5	= (1<<Bit5)-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1,

	/* 6 bytes */
	T6	= 0xFC,
	Bit6	= 2,
	Mask6	= (1<<Bit6)-1,

#ifndef	EILSEQ
	EILSEQ	= 123,
#endif /* EILSEQ */
};
338
 
339
int
340
our_wctomb(char *s, unsigned long wc)
341
{
342
	if(s == 0)
343
		return 0;		/* no shift states */
344
	if(wc & ~Wchar2) {
345
		if(wc & ~Wchar4) {
346
			if(wc & ~Wchar5) {
347
				/* 6 bytes */
348
				s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
349
				s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
350
				s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
351
				s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
352
				s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
353
				s[5] = Tx |  (wc & Maskx);
354
				return 6;
355
			}
356
			/* 5 bytes */
357
			s[0] = T5 |  (wc >> 4*Bitx);
358
			s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
359
			s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
360
			s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
361
			s[4] = Tx |  (wc & Maskx);
362
			return 5;
363
		}
364
		if(wc & ~Wchar3) {
365
			/* 4 bytes */
366
			s[0] = T4 |  (wc >> 3*Bitx);
367
			s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
368
			s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
369
			s[3] = Tx |  (wc & Maskx);
370
			return 4;
371
		}
372
		/* 3 bytes */
373
		s[0] = T3 |  (wc >> 2*Bitx);
374
		s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
375
		s[2] = Tx |  (wc & Maskx);
376
		return 3;
377
	}
378
	if(wc & ~Wchar1) {
379
		/* 2 bytes */
380
		s[0] = T2 | (wc >> 1*Bitx);
381
		s[1] = Tx | (wc & Maskx);
382
		return 2;
383
	}
384
	/* 1 byte */
385
	s[0] = T1 | wc;
386
	return 1;
387
}
388
 
389
int
390
our_mbtowc(unsigned long *p, char *s, unsigned n)
391
{
392
	uchar *us;
393
	int c0, c1, c2, c3, c4, c5;
394
	unsigned long wc;
395
 
396
	if(s == 0)
397
		return 0;		/* no shift states */
398
 
399
	if(n < 1)
400
		goto bad;
401
	us = (uchar*)s;
402
	c0 = us[0];
403
	if(c0 >= T3) {
404
		if(n < 3)
405
			goto bad;
406
		c1 = us[1] ^ Tx;
407
		c2 = us[2] ^ Tx;
408
		if((c1|c2) & T2)
409
			goto bad;
410
		if(c0 >= T5) {
411
			if(n < 5)
412
				goto bad;
413
			c3 = us[3] ^ Tx;
414
			c4 = us[4] ^ Tx;
415
			if((c3|c4) & T2)
416
				goto bad;
417
			if(c0 >= T6) {
418
				/* 6 bytes */
419
				if(n < 6)
420
					goto bad;
421
				c5 = us[5] ^ Tx;
422
				if(c5 & T2)
423
					goto bad;
424
				wc = ((((((((((c0 & Mask6) << Bitx) |
425
					c1) << Bitx) | c2) << Bitx) |
426
					c3) << Bitx) | c4) << Bitx) | c5;
427
				if(wc <= Wchar5)
428
					goto bad;
429
				*p = wc;
430
				return 6;
431
			}
432
			/* 5 bytes */
433
			wc = ((((((((c0 & Mask5) << Bitx) |
434
				c1) << Bitx) | c2) << Bitx) |
435
				c3) << Bitx) | c4;
436
			if(wc <= Wchar4)
437
				goto bad;
438
			*p = wc;
439
			return 5;
440
		}
441
		if(c0 >= T4) {
442
			/* 4 bytes */
443
			if(n < 4)
444
				goto bad;
445
			c3 = us[3] ^ Tx;
446
			if(c3 & T2)
447
				goto bad;
448
			wc = ((((((c0 & Mask4) << Bitx) |
449
				c1) << Bitx) | c2) << Bitx) |
450
				c3;
451
			if(wc <= Wchar3)
452
				goto bad;
453
			*p = wc;
454
			return 4;
455
		}
456
		/* 3 bytes */
457
		wc = ((((c0 & Mask3) << Bitx) |
458
			c1) << Bitx) | c2;
459
		if(wc <= Wchar2)
460
			goto bad;
461
		*p = wc;
462
		return 3;
463
	}
464
	if(c0 >= T2) {
465
		/* 2 bytes */
466
		if(n < 2)
467
			goto bad;
468
		c1 = us[1] ^ Tx;
469
		if(c1 & T2)
470
			goto bad;
471
		wc = ((c0 & Mask2) << Bitx) |
472
			c1;
473
		if(wc <= Wchar1)
474
			goto bad;
475
		*p = wc;
476
		return 2;
477
	}
478
	/* 1 byte */
479
	if(c0 >= Tx)
480
		goto bad;
481
	*p = c0;
482
	return 1;
483
 
484
bad:
485
	errno = EILSEQ;
486
	return -1;
487
}