Subversion Repositories planix.SVN

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
#ifdef	PLAN9
2
#include	<u.h>
3
#include	<libc.h>
4
#include	<bio.h>
5
#else
6
#include	<stdio.h>
7
#include	<unistd.h>
8
#include	"plan9.h"
9
#endif
10
#include	"hdr.h"
11
#include	"conv.h"
12
#include	"kuten208.h"
13
#include	"jis.h"
14
 
15
/*
16
	a state machine for interpreting all sorts of encodings
17
*/
18
static void
19
alljis(int c, Rune **r, long input_loc)
20
{
21
	static enum { state0, state1, state2, state3, state4 } state = state0;
22
	static int set8 = 0;
23
	static int japan646 = 0;
24
	static int lastc;
25
	int n;
26
	long l;
27
 
28
again:
29
	switch(state)
30
	{
31
	case state0:	/* idle state */
32
		if(c == ESC){ state = state1; return; }
33
		if(c < 0) return;
34
		if(!set8 && (c < 128)){
35
			if(japan646){
36
				switch(c)
37
				{
38
				case '\\':	emit(0xA5); return;	/* yen */
39
				case '~':	emit(0xAF); return;	/* spacing macron */
40
				default:	emit(c); return;
41
				}
42
			} else {
43
				emit(c);
44
				return;
45
			}
46
		}
47
		if(c < 0x21){	/* guard against bogus characters in JIS mode */
48
			if(squawk)
49
				EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
50
			emit(c);
51
			return;
52
		}
53
		lastc = c; state = state4; return;
54
 
55
	case state1:	/* seen an escape */
56
		if(c == '$'){ state = state2; return; }
57
		if(c == '('){ state = state3; return; }
58
		emit(ESC); state = state0; goto again;
59
 
60
	case state2:	/* may be shifting into JIS */
61
		if((c == '@') || (c == 'B')){
62
			set8 = 1; state = state0; return;
63
		}
64
		emit(ESC); emit('$'); state = state0; goto again;
65
 
66
	case state3:	/* may be shifting out of JIS */
67
		if((c == 'J') || (c == 'H') || (c == 'B')){
68
			japan646 = (c == 'J');
69
			set8 = 0; state = state0; return;
70
		}
71
		emit(ESC); emit('('); state = state0; goto again;
72
 
73
	case state4:	/* two part char */
74
		if(c < 0){
75
			if(squawk)
76
				EPR "%s: unexpected EOF in %s\n", argv0, file);
77
			c = 0x21 | (lastc&0x80);
78
		}
79
		if(CANS2J(lastc, c)){	/* ms dos sjis */
80
			int hi = lastc, lo = c;
81
			S2J(hi, lo);			/* convert to 208 */
82
			n = hi*100 + lo - 3232;		/* convert to kuten208 */
83
		} else
84
			n = (lastc&0x7F)*100 + (c&0x7f) - 3232;	/* kuten208 */
85
		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
86
			nerrors++;
87
			if(squawk)
88
				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
89
			if(!clean)
90
				emit(BADMAP);
91
		} else {
92
			if(l < 0){
93
				l = -l;
94
				if(squawk)
95
					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
96
			}
97
			emit(l);
98
		}
99
		state = state0;
100
	}
101
}
102
 
103
/*
104
	a state machine for interpreting ms-kanji == shift-jis.
105
*/
106
static void
107
ms(int c, Rune **r, long input_loc)
108
{
109
	static enum { state0, state1, state2, state3, state4 } state = state0;
110
	static int set8 = 0;
111
	static int japan646 = 0;
112
	static int lastc;
113
	int n;
114
	long l;
115
 
116
again:
117
	switch(state)
118
	{
119
	case state0:	/* idle state */
120
		if(c == ESC){ state = state1; return; }
121
		if(c < 0) return;
122
		if(!set8 && (c < 128)){
123
			if(japan646){
124
				switch(c)
125
				{
126
				case '\\':	emit(0xA5); return;	/* yen */
127
				case '~':	emit(0xAF); return;	/* spacing macron */
128
				default:	emit(c); return;
129
				}
130
			} else {
131
				emit(c);
132
				return;
133
			}
134
		}
135
		lastc = c; state = state4; return;
136
 
137
	case state1:	/* seen an escape */
138
		if(c == '$'){ state = state2; return; }
139
		if(c == '('){ state = state3; return; }
140
		emit(ESC); state = state0; goto again;
141
 
142
	case state2:	/* may be shifting into JIS */
143
		if((c == '@') || (c == 'B')){
144
			set8 = 1; state = state0; return;
145
		}
146
		emit(ESC); emit('$'); state = state0; goto again;
147
 
148
	case state3:	/* may be shifting out of JIS */
149
		if((c == 'J') || (c == 'H') || (c == 'B')){
150
			japan646 = (c == 'J');
151
			set8 = 0; state = state0; return;
152
		}
153
		emit(ESC); emit('('); state = state0; goto again;
154
 
155
	case state4:	/* two part char */
156
		if(c < 0){
157
			if(squawk)
158
				EPR "%s: unexpected EOF in %s\n", argv0, file);
159
			c = 0x21 | (lastc&0x80);
160
		}
161
		if(CANS2J(lastc, c)){	/* ms dos sjis */
162
			int hi = lastc, lo = c;
163
			S2J(hi, lo);			/* convert to 208 */
164
			n = hi*100 + lo - 3232;		/* convert to kuten208 */
165
		} else {
166
			nerrors++;
167
			if(squawk)
168
				EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
169
			if(!clean)
170
				emit(BADMAP);
171
			state = state0;
172
			goto again;
173
		}
174
		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
175
			nerrors++;
176
			if(squawk)
177
				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
178
			if(!clean)
179
				emit(BADMAP);
180
		} else {
181
			if(l < 0){
182
				l = -l;
183
				if(squawk)
184
					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
185
			}
186
			emit(l);
187
		}
188
		state = state0;
189
	}
190
}
191
 
192
/*
193
	a state machine for interpreting ujis == EUC
194
*/
195
static void
196
ujis(int c, Rune **r, long input_loc)
197
{
198
	static enum { state0, state1 } state = state0;
199
	static int lastc;
200
	int n;
201
	long l;
202
 
203
	switch(state)
204
	{
205
	case state0:	/* idle state */
206
		if(c < 0) return;
207
		if(c < 128){
208
			emit(c);
209
			return;
210
		}
211
		if(c == 0x8e){	/* codeset 2 */
212
			nerrors++;
213
			if(squawk)
214
				EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
215
			if(!clean)
216
				emit(BADMAP);
217
			return;
218
		}
219
		if(c == 0x8f){	/* codeset 3 */
220
			nerrors++;
221
			if(squawk)
222
				EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
223
			if(!clean)
224
				emit(BADMAP);
225
			return;
226
		}
227
		lastc = c;
228
		state = state1;
229
		return;
230
 
231
	case state1:	/* two part char */
232
		if(c < 0){
233
			if(squawk)
234
				EPR "%s: unexpected EOF in %s\n", argv0, file);
235
			c = 0xA1;
236
		}
237
		n = (lastc&0x7F)*100 + (c&0x7F) - 3232;	/* kuten208 */
238
		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
239
			nerrors++;
240
			if(squawk)
241
				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
242
			if(!clean)
243
				emit(BADMAP);
244
		} else {
245
			if(l < 0){
246
				l = -l;
247
				if(squawk)
248
					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
249
			}
250
			emit(l);
251
		}
252
		state = state0;
253
	}
254
}
255
 
256
/*
257
	a state machine for interpreting jis-kanji == 2022-JP
258
*/
259
static void
260
jis(int c, Rune **r, long input_loc)
261
{
262
	static enum { state0, state1, state2, state3, state4 } state = state0;
263
	static int set8 = 0;
264
	static int japan646 = 0;
265
	static int lastc;
266
	int n;
267
	long l;
268
 
269
again:
270
	switch(state)
271
	{
272
	case state0:	/* idle state */
273
		if(c == ESC){ state = state1; return; }
274
		if(c < 0) return;
275
		if(!set8 && (c < 128)){
276
			if(japan646){
277
				switch(c)
278
				{
279
				case '\\':	emit(0xA5); return;	/* yen */
280
				case '~':	emit(0xAF); return;	/* spacing macron */
281
				default:	emit(c); return;
282
				}
283
			} else {
284
				emit(c);
285
				return;
286
			}
287
		}
288
		lastc = c; state = state4; return;
289
 
290
	case state1:	/* seen an escape */
291
		if(c == '$'){ state = state2; return; }
292
		if(c == '('){ state = state3; return; }
293
		emit(ESC); state = state0; goto again;
294
 
295
	case state2:	/* may be shifting into JIS */
296
		if((c == '@') || (c == 'B')){
297
			set8 = 1; state = state0; return;
298
		}
299
		emit(ESC); emit('$'); state = state0; goto again;
300
 
301
	case state3:	/* may be shifting out of JIS */
302
		if((c == 'J') || (c == 'H') || (c == 'B')){
303
			japan646 = (c == 'J');
304
			set8 = 0; state = state0; return;
305
		}
306
		emit(ESC); emit('('); state = state0; goto again;
307
 
308
	case state4:	/* two part char */
309
		if(c < 0){
310
			if(squawk)
311
				EPR "%s: unexpected EOF in %s\n", argv0, file);
312
			c = 0x21 | (lastc&0x80);
313
		}
314
		if((lastc&0x80) != (c&0x80)){	/* guard against latin1 in jis */
315
			emit(lastc);
316
			state = state0;
317
			goto again;
318
		}
319
		n = (lastc&0x7F)*100 + (c&0x7f) - 3232;	/* kuten208 */
320
		if((n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
321
			nerrors++;
322
			if(squawk)
323
				EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
324
			if(!clean)
325
				emit(BADMAP);
326
		} else {
327
			if(l < 0){
328
				l = -l;
329
				if(squawk)
330
					EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
331
			}
332
			emit(l);
333
		}
334
		state = state0;
335
	}
336
}
337
 
338
static void
339
do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
340
{
341
	Rune ob[N];
342
	Rune *r, *re;
343
	uchar ibuf[N];
344
	int n, i;
345
	long nin;
346
 
347
	r = ob;
348
	re = ob+N-3;
349
	nin = 0;
350
	while((n = read(fd, ibuf, sizeof ibuf)) > 0){
351
		for(i = 0; i < n; i++){
352
			(*procfn)(ibuf[i], &r, nin++);
353
			if(r >= re){
354
				OUT(out, ob, r-ob);
355
				r = ob;
356
			}
357
		}
358
		if(r > ob){
359
			OUT(out, ob, r-ob);
360
			r = ob;
361
		}
362
	}
363
	(*procfn)(-1, &r, nin);
364
	if(r > ob)
365
		OUT(out, ob, r-ob);
366
	OUT(out, ob, 0);
367
}
368
 
369
void
370
jis_in(int fd, long *notused, struct convert *out)
371
{
372
	USED(notused);
373
	do_in(fd, alljis, out);
374
}
375
 
376
void
377
ujis_in(int fd, long *notused, struct convert *out)
378
{
379
	USED(notused);
380
	do_in(fd, ujis, out);
381
}
382
 
383
void
384
msjis_in(int fd, long *notused, struct convert *out)
385
{
386
	USED(notused);
387
	do_in(fd, ms, out);
388
}
389
 
390
void
391
jisjis_in(int fd, long *notused, struct convert *out)
392
{
393
	USED(notused);
394
	do_in(fd, jis, out);
395
}
396
 
397
static int first = 1;
398
 
399
static void
400
tab_init(void)
401
{
402
	int i;
403
	long l;
404
 
405
	first = 0;
406
	for(i = 0; i < NRUNE; i++)
407
		tab[i] = -1;
408
	for(i = 0; i < KUTEN208MAX; i++)
409
		if((l = tabkuten208[i]) != -1){
410
			if(l < 0)
411
				tab[-l] = i;
412
			else
413
				tab[l] = i;
414
		}
415
}
416
 
417
 
418
/*	jis-kanji, or ISO 2022-JP	*/
419
void
420
jisjis_out(Rune *base, int n, long *notused)
421
{
422
	char *p;
423
	int i;
424
	Rune r;
425
	static enum { ascii, japan646, jp2022 } state = ascii;
426
 
427
	USED(notused);
428
	if(first)
429
		tab_init();
430
	nrunes += n;
431
	p = obuf;
432
	for(i = 0; i < n; i++){
433
		r = base[i];
434
		if(r < 128){
435
			if(state == jp2022){
436
				*p++ = ESC; *p++ = '('; *p++ = 'B';
437
				state = ascii;
438
			}
439
			*p++ = r;
440
		} else {
441
			if(tab[r] != -1){
442
				if(state != jp2022){
443
					*p++ = ESC; *p++ = '$'; *p++ = 'B';
444
					state = jp2022;
445
				}
446
				*p++ = tab[r]/100 + ' ';
447
				*p++ = tab[r]%100 + ' ';
448
				continue;
449
			}
450
			if(squawk)
451
				EPR "%s: rune 0x%x not in output cs\n", argv0, r);
452
			nerrors++;
453
			if(clean)
454
				continue;
455
			*p++ = BYTEBADMAP;
456
		}
457
	}
458
	noutput += p-obuf;
459
	if(p > obuf)
460
		write(1, obuf, p-obuf);
461
}
462
 
463
/*	ms-kanji, or Shift-JIS	*/
464
void
465
msjis_out(Rune *base, int n, long *notused)
466
{
467
	char *p;
468
	int i, hi, lo;
469
	Rune r;
470
 
471
	USED(notused);
472
	if(first)
473
		tab_init();
474
	nrunes += n;
475
	p = obuf;
476
	for(i = 0; i < n; i++){
477
		r = base[i];
478
		if(r < 128)
479
			*p++ = r;
480
		else {
481
			if(tab[r] != -1){
482
				hi = tab[r]/100 + ' ';
483
				lo = tab[r]%100 + ' ';
484
				J2S(hi, lo);
485
				*p++ = hi;
486
				*p++ = lo;
487
				continue;
488
			}
489
			if(squawk)
490
				EPR "%s: rune 0x%x not in output cs\n", argv0, r);
491
			nerrors++;
492
			if(clean)
493
				continue;
494
			*p++ = BYTEBADMAP;
495
		}
496
	}
497
	noutput += p-obuf;
498
	if(p > obuf)
499
		write(1, obuf, p-obuf);
500
}
501
 
502
/*	ujis, or EUC	*/
503
void
504
ujis_out(Rune *base, int n, long *notused)
505
{
506
	char *p;
507
	int i;
508
	Rune r;
509
 
510
	USED(notused);
511
	if(first)
512
		tab_init();
513
	nrunes += n;
514
	p = obuf;
515
	for(i = 0; i < n; i++){
516
		r = base[i];
517
		if(r < 128)
518
			*p++ = r;
519
		else {
520
			if(tab[r] != -1){
521
				*p++ = 0x80 | (tab[r]/100 + ' ');
522
				*p++ = 0x80 | (tab[r]%100 + ' ');
523
				continue;
524
			}
525
			if(squawk)
526
				EPR "%s: rune 0x%x not in output cs\n", argv0, r);
527
			nerrors++;
528
			if(clean)
529
				continue;
530
			*p++ = BYTEBADMAP;
531
		}
532
	}
533
	noutput += p-obuf;
534
	if(p > obuf)
535
		write(1, obuf, p-obuf);
536
}