Subversion Repositories planix.SVN

Rev

Rev 2 | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
2 - 1
/*
 * chartrans.c
 * Copyright (C) 1999-2004 A.J. van Os; Released under GNU GPL
 *
 * Description:
 * Translate Word characters to local representation
 */
8
 
9
#include <stdlib.h>
10
#include <string.h>
11
#include <ctype.h>
12
#if defined(__STDC_ISO_10646__)
13
#include <wctype.h>
14
#endif /* __STDC_ISO_10646__ */
15
#include "antiword.h"
16
 
17
/*
 * Code page 850 to Unicode.
 * Indexed by (character - 0x80); covers the characters 0x80 .. 0xff.
 */
static const USHORT usCp850[] = {	/* DOS implementation of Latin1 */
	0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7,
	0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5,
	0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9,
	0x00ff, 0x00d6, 0x00dc, 0x00f8, 0x00a3, 0x00d8, 0x00d7, 0x0192,
	0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba,
	0x00bf, 0x00ae, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb,
	0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x00c1, 0x00c2, 0x00c0,
	0x00a9, 0x2563, 0x2551, 0x2557, 0x255d, 0x00a2, 0x00a5, 0x2510,
	0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x00e3, 0x00c3,
	0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x00a4,
	0x00f0, 0x00d0, 0x00ca, 0x00cb, 0x00c8, 0x0131, 0x00cd, 0x00ce,
	0x00cf, 0x2518, 0x250c, 0x2588, 0x2584, 0x00a6, 0x00cc, 0x2580,
	0x00d3, 0x00df, 0x00d4, 0x00d2, 0x00f5, 0x00d5, 0x00b5, 0x00fe,
	0x00de, 0x00da, 0x00db, 0x00d9, 0x00fd, 0x00dd, 0x00af, 0x00b4,
	0x00ad, 0x00b1, 0x2017, 0x00be, 0x00b6, 0x00a7, 0x00f7, 0x00b8,
	0x00b0, 0x00a8, 0x00b7, 0x00b9, 0x00b3, 0x00b2, 0x25a0, 0x00a0,
};
35
 
36
/*
 * Code page 1250 to Unicode.
 * Indexed by (character - 0x80); covers the characters 0x80 .. 0xff.
 * 0x003f ('?') appears to be the placeholder for positions that have no
 * Unicode equivalent.
 */
static const USHORT usCp1250[] = {	/* Windows implementation of Latin2 */
	0x20ac, 0x003f, 0x201a, 0x003f, 0x201e, 0x2026, 0x2020, 0x2021,
	0x003f, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179,
	0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
	0x003f, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a,
	0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7,
	0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b,
	0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
	0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c,
	0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7,
	0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e,
	0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7,
	0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df,
	0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7,
	0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f,
	0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7,
	0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9,
};
54
 
55
/*
 * Code page 1251 to Unicode.
 * Indexed by (character - 0x80); covers the characters 0x80 .. 0xff.
 * Position 0x98 is not assigned in code page 1251, so it maps to
 * 0x003f ('?').
 */
static const USHORT usCp1251[] = {	/* Windows implementation of Cyrillic */
	0x0402, 0x0403, 0x201a, 0x0453, 0x201e, 0x2026, 0x2020, 0x2021,
	0x20ac, 0x2030, 0x0409, 0x2039, 0x040a, 0x040c, 0x040b, 0x040f,
	0x0452, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
	0x003f, 0x2122, 0x0459, 0x203a, 0x045a, 0x045c, 0x045b, 0x045f,
	0x00a0, 0x040e, 0x045e, 0x0408, 0x00a4, 0x0490, 0x00a6, 0x00a7,
	0x0401, 0x00a9, 0x0404, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x0407,
	0x00b0, 0x00b1, 0x0406, 0x0456, 0x0491, 0x00b5, 0x00b6, 0x00b7,
	0x0451, 0x2116, 0x0454, 0x00bb, 0x0458, 0x0405, 0x0455, 0x0457,
	0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
	0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
	0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
	0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
	0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437,
	0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f,
	0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447,
	0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f,
};
73
 
74
/*
 * Code page 1252 to Unicode.
 * Indexed by (character - 0x80); covers the characters 0x80 .. 0xff.
 * 0x003f ('?') appears to be the placeholder for positions that have no
 * Unicode equivalent.
 */
static const USHORT usCp1252[] = {	/* Windows implementation of Latin1 */
	0x20ac, 0x003f, 0x201a, 0x0192, 0x201e, 0x2026, 0x2020, 0x2021,
	0x02c6, 0x2030, 0x0160, 0x2039, 0x0152, 0x003f, 0x017d, 0x003f,
	0x003f, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014,
	0x02dc, 0x2122, 0x0161, 0x203a, 0x0153, 0x003f, 0x017e, 0x0178,
	0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7,
	0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af,
	0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7,
	0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf,
	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
	0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7,
	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df,
	0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7,
	0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef,
	0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7,
	0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff,
};
92
 
93
/*
 * Mac Roman to Unicode.
 * Indexed by (character - 0x80); covers the characters 0x80 .. 0xff.
 * 0x003f ('?') appears to be the placeholder for positions that have no
 * Unicode equivalent.
 */
static const USHORT usMacRoman[] = {	/* Apple implementation of Latin1 */
	0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1,
	0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8,
	0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3,
	0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc,
	0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df,
	0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8,
	0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211,
	0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x2126, 0x00e6, 0x00f8,
	0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab,
	0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153,
	0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca,
	0x00ff, 0x0178, 0x2044, 0x00a4, 0x2039, 0x203a, 0xfb01, 0xfb02,
	0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1,
	0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4,
	0x003f, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc,
	0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7,
};
111
 
112
/*
 * Microsoft private-area characters to real Unicode.
 * Indexed by (character - 0xf020); covers the characters 0xf020 .. 0xf0ff
 * (224 entries). Entries equal to 0x003f ('?') are the ones the debug
 * output in ulTranslateCharacters reports.
 */
static const USHORT usPrivateArea[] = {
	0x0020, 0x0021, 0x2200, 0x0023, 0x2203, 0x0025, 0x0026, 0x220d,
	0x0028, 0x0029, 0x2217, 0x002b, 0x002c, 0x2212, 0x002e, 0x002f,
	0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037,
	0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x2019, 0x003e, 0x003f,
	0x201d, 0x201c, 0x0392, 0x03a7, 0x0394, 0x0395, 0x03a6, 0x0393,
	0x0397, 0x0399, 0x03d1, 0x039a, 0x039b, 0x039c, 0x039d, 0x039f,
	0x03a0, 0x0398, 0x03a1, 0x03a3, 0x03a4, 0x03a5, 0x03c2, 0x03a9,
	0x039e, 0x03a8, 0x0396, 0x005b, 0x2234, 0x005d, 0x22a5, 0x005f,
	0x003f, 0x03b1, 0x03b2, 0x03c7, 0x03b4, 0x03b5, 0x03c6, 0x03b3,
	0x03b7, 0x03b9, 0x03d5, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03bf,
	0x03c0, 0x03b8, 0x03c1, 0x03c3, 0x03c4, 0x03c5, 0x03d6, 0x03c9,
	0x03be, 0x03c8, 0x03b6, 0x007b, 0x007c, 0x007d, 0x223c, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x2022, 0x003f, 0x003f, 0x003f, 0x003f,
	0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f, 0x003f,
	0x20ac, 0x03d2, 0x2032, 0x2264, 0x2044, 0x221e, 0x0192, 0x2663,
	0x2666, 0x2665, 0x2660, 0x2194, 0x2190, 0x2191, 0x2192, 0x2193,
	0x00b0, 0x00b1, 0x2033, 0x2265, 0x00d7, 0x221d, 0x2202, 0x2022,
	0x00f7, 0x2260, 0x2261, 0x2248, 0x2026, 0x007c, 0x23af, 0x21b5,
	0x2135, 0x2111, 0x211c, 0x2118, 0x2297, 0x2295, 0x2205, 0x2229,
	0x222a, 0x2283, 0x2287, 0x2284, 0x2282, 0x2286, 0x2208, 0x2209,
	0x2220, 0x2207, 0x00ae, 0x00a9, 0x2122, 0x220f, 0x221a, 0x22c5,
	0x00ac, 0x2227, 0x2228, 0x21d4, 0x21d0, 0x21d1, 0x21d2, 0x21d3,
	0x22c4, 0x3008, 0x00ae, 0x00a9, 0x2122, 0x2211, 0x239b, 0x239c,
	0x239d, 0x23a1, 0x23a2, 0x23a3, 0x23a7, 0x23a8, 0x23a9, 0x23aa,
	0x003f, 0x3009, 0x222b, 0x2320, 0x23ae, 0x2321, 0x239e, 0x239f,
	0x23a0, 0x23a4, 0x23a5, 0x23a6, 0x23ab, 0x23ac, 0x23ad, 0x003f,
};
142
 
143
typedef struct char_table_tag {
144
	UCHAR	ucLocal;
145
	USHORT	usUnicode;
146
} char_table_type;
147
 
148
static char_table_type	atCharTable[256];
149
static size_t		tNextPosFree = 0;
150
 
151
 
152
/*
153
 * iCompare - compare two records
154
 *
155
 * Compares two records. For use by qsort(3C) and bsearch(3C).
156
 *
157
 * returns -1 if rec1 < rec2, 0 if rec1 == rec2, 1 if rec1 > rec2
158
 */
159
static int
160
iCompare(const void *pvRecord1, const void *pvRecord2)
161
{
162
	USHORT	usUnicode1, usUnicode2;
163
 
164
	usUnicode1 = ((char_table_type *)pvRecord1)->usUnicode;
165
	usUnicode2 = ((char_table_type *)pvRecord2)->usUnicode;
166
 
167
	if (usUnicode1 < usUnicode2) {
168
		return -1;
169
	}
170
	if (usUnicode1 > usUnicode2) {
171
		return 1;
172
	}
173
	return 0;
174
} /* end of iCompare */
175
 
176
/*
177
 * pGetCharTableRecord - get the character table record
178
 *
179
 * returns a pointer to the record when found, otherwise NULL
180
 */
181
static const char_table_type *
182
pGetCharTableRecord(USHORT usUnicode)
183
{
184
	char_table_type	tKey;
185
 
186
	if (tNextPosFree == 0) {
187
		return NULL;
188
	}
189
	tKey.usUnicode = usUnicode;
190
	tKey.ucLocal = 0;
191
	return (char_table_type *)bsearch(&tKey,
192
			atCharTable,
193
			tNextPosFree, sizeof(atCharTable[0]),
194
			iCompare);
195
} /* end of pGetCharTableRecord */
196
 
197
/*
198
 * ucGetBulletCharacter - get the local representation of the bullet
199
 */
200
UCHAR
201
ucGetBulletCharacter(conversion_type eConversionType, encoding_type eEncoding)
202
{
203
#if defined(__riscos)
204
	return 0x8f;
205
#else
206
	const char_table_type	*pRec;
207
 
208
	fail(eEncoding == encoding_utf_8);
209
 
210
	if (eEncoding == encoding_latin_1 &&
211
	    (eConversionType == conversion_ps ||
212
	     eConversionType == conversion_pdf)) {
213
		/* Ugly, but it makes the PostScript and PDF look better */
214
		return (UCHAR)143;
215
	}
216
	if (eConversionType != conversion_text &&
217
	    eConversionType != conversion_fmt_text) {
218
		pRec = pGetCharTableRecord(UNICODE_BULLET);
219
		if (pRec != NULL) {
220
			return pRec->ucLocal;
221
		}
222
		pRec = pGetCharTableRecord(UNICODE_BULLET_OPERATOR);
223
		if (pRec != NULL) {
224
			return pRec->ucLocal;
225
		}
226
		pRec = pGetCharTableRecord(UNICODE_MIDDLE_DOT);
227
		if (pRec != NULL) {
228
			return pRec->ucLocal;
229
		}
230
	}
231
	return (UCHAR)'.';
232
#endif /* __riscos */
233
} /* end of ucGetBulletCharacter */
234
 
235
/*
236
 * ucGetNbspCharacter - get the local representation of the non-breaking space
237
 */
238
UCHAR
239
ucGetNbspCharacter(void)
240
{
241
	const char_table_type	*pRec;
242
 
243
	pRec = pGetCharTableRecord(0x00a0);	/* Unicode non-breaking space */
244
	if (pRec == NULL) {
245
		DBG_MSG("Non-breaking space record not found");
246
		/* No value found, use the best guess */
247
		return (UCHAR)0xa0;
248
	}
249
	return pRec->ucLocal;
250
} /* end of ucGetNbspCharacter */
251
 
252
/*
253
 * bReadCharacterMappingTable - read the mapping table
254
 *
255
 * Read the character mapping table from file and have the contents sorted
256
 *
257
 * returns TRUE if successful, otherwise FALSE
258
 */
259
BOOL
260
bReadCharacterMappingTable(FILE *pFile)
261
{
262
	char	*pcTmp;
263
	ULONG	ulUnicode;
264
	UINT	uiLocal;
265
	int	iFields;
266
	char	szLine[81];
267
 
268
	if (pFile == NULL) {
269
		return FALSE;
270
	}
271
 
272
	/* Clean the table first */
273
	(void)memset(atCharTable, 0, sizeof(atCharTable));
274
 
275
	/* Fill the table */
276
	while (fgets(szLine, (int)sizeof(szLine), pFile)) {
277
		if (szLine[0] == '#' ||
278
		    szLine[0] == '\r' ||
279
		    szLine[0] == '\n') {
280
			/* Comment or empty line */
281
			continue;
282
		}
283
		iFields = sscanf(szLine, "%x %lx %*s", &uiLocal, &ulUnicode);
284
		if (iFields != 2) {
285
			pcTmp = strchr(szLine, '\r');
286
			if (pcTmp != NULL) {
287
				*pcTmp = '\0';
288
			}
289
			pcTmp = strchr(szLine, '\n');
290
			if (pcTmp != NULL) {
291
				*pcTmp = '\0';
292
			}
293
			werr(0, "Syntax error in: '%s'", szLine);
294
			continue;
295
		}
296
		if (uiLocal > 0xff || ulUnicode > 0xffff) {
297
			werr(0, "Syntax error in: '%02x %04lx'",
298
					uiLocal, ulUnicode);
299
			continue;
300
		}
301
		/* Store only the relevant entries */
302
		if (uiLocal != ulUnicode || uiLocal >= 0x80) {
303
			atCharTable[tNextPosFree].ucLocal = (UCHAR)uiLocal;
304
			atCharTable[tNextPosFree].usUnicode = (USHORT)ulUnicode;
305
			tNextPosFree++;
306
		}
307
		if (tNextPosFree >= elementsof(atCharTable)) {
308
			werr(0, "Too many entries in the character mapping "
309
				"file. Ignoring the rest.");
310
			break;
311
		}
312
	}
313
 
314
	if (tNextPosFree != 0) {
315
		DBG_HEX(atCharTable[0].usUnicode);
316
		DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
317
 
318
		qsort(atCharTable,
319
			tNextPosFree, sizeof(atCharTable[0]),
320
			iCompare);
321
 
322
		DBG_HEX(atCharTable[0].usUnicode);
323
		DBG_HEX(atCharTable[tNextPosFree - 1].usUnicode);
324
	}
325
 
326
	return TRUE;
327
} /* end of bReadCharacterMappingTable */
328
 
329
/*
330
 * ulTranslateCharacters - Translate characters to local representation
331
 *
332
 * Translate all characters to local representation
333
 *
334
 * returns the translated character
335
 */
336
ULONG
337
ulTranslateCharacters(USHORT usChar, ULONG ulFileOffset, int iWordVersion,
338
	conversion_type eConversionType, encoding_type eEncoding,
339
	BOOL bUseMacCharSet)
340
{
341
	const char_table_type	*pTmp;
342
	const USHORT	*usCharSet;
343
 
344
	usCharSet = NULL;
345
	if (bUseMacCharSet) {
346
		/* Macintosh character set */
347
		usCharSet = usMacRoman;
348
	} else if (iWordVersion == 0) {
349
		/* DOS character set */
350
		usCharSet = usCp850;
351
	} else {
352
		/* Windows character set */
353
		switch (eEncoding) {
354
		case encoding_latin_2:
355
			usCharSet = usCp1250;
356
			break;
357
		case encoding_cyrillic:
358
			usCharSet = usCp1251;
359
			break;
360
		case encoding_latin_1:
361
		default:
362
			usCharSet = usCp1252;
363
			break;
364
		}
365
	}
366
	fail(usCharSet == NULL);
367
	if (usChar >= 0x80 && usChar <= 0x9f) {
368
		/* Translate implementation defined characters */
369
		usChar = usCharSet[usChar - 0x80];
370
	} else if (iWordVersion < 8 && usChar >= 0xa0 && usChar <= 0xff) {
371
		/* Translate old character set to Unixcode */
372
		usChar = usCharSet[usChar - 0x80];
373
	}
374
 
375
	/* Microsoft Unicode to real Unicode */
376
	if (usChar >= 0xf020 && usChar <= 0xf0ff) {
377
		DBG_HEX_C(usPrivateArea[usChar - 0xf020] == 0x003f, usChar);
378
		usChar = usPrivateArea[usChar - 0xf020];
379
	}
380
 
381
	/* Characters with a special meaning in Word */
382
	switch (usChar) {
383
	case IGNORE_CHARACTER:
384
	case FOOTNOTE_SEPARATOR:
385
	case FOOTNOTE_CONTINUATION:
386
	case ANNOTATION:
387
	case FRAME:
388
	case LINE_FEED:
389
	case WORD_SOFT_HYPHEN:
390
	case UNICODE_HYPHENATION_POINT:
391
		return IGNORE_CHARACTER;
392
	case PICTURE:
393
	case TABLE_SEPARATOR:
394
	case TAB:
395
	case HARD_RETURN:
396
	case PAGE_BREAK:
397
	case PAR_END:
398
	case COLUMN_FEED:
399
		return (ULONG)usChar;
400
	case FOOTNOTE_OR_ENDNOTE:
401
		NO_DBG_HEX(ulFileOffset);
402
		switch (eGetNotetype(ulFileOffset)) {
403
		case notetype_is_footnote:
404
			return FOOTNOTE_CHAR;
405
		case notetype_is_endnote:
406
			return ENDNOTE_CHAR;
407
		default:
408
			return UNKNOWN_NOTE_CHAR;
409
		}
410
	case WORD_UNBREAKABLE_JOIN:
411
		return (ULONG)OUR_UNBREAKABLE_JOIN;
412
	default:
413
		break;
414
	}
415
 
416
	if (eEncoding != encoding_utf_8) {
417
		/* Latin characters in an oriental text */
418
		if (usChar >= 0xff01 && usChar <= 0xff5e) {
419
			usChar -= 0xfee0;
420
		}
421
	}
422
 
423
	if (eEncoding == encoding_latin_1 &&
424
	    (eConversionType == conversion_ps ||
425
	     eConversionType == conversion_pdf)) {
426
		/* Ugly, but it makes the PostScript and PDF look better */
427
		switch (usChar) {
428
		case UNICODE_ELLIPSIS:
429
			return 140;
430
		case UNICODE_TRADEMARK_SIGN:
431
			return 141;
432
		case UNICODE_PER_MILLE_SIGN:
433
			return 142;
434
		case UNICODE_BULLET:
435
		case UNICODE_BULLET_OPERATOR:
436
		case UNICODE_BLACK_CLUB_SUIT:
437
			return 143;
438
		case UNICODE_LEFT_SINGLE_QMARK:
439
			return 144;
440
		case UNICODE_RIGHT_SINGLE_QMARK:
441
			return 145;
442
		case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
443
			return 146;
444
		case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
445
			return 147;
446
		case UNICODE_LEFT_DOUBLE_QMARK:
447
			return 148;
448
		case UNICODE_RIGHT_DOUBLE_QMARK:
449
			return 149;
450
		case UNICODE_DOUBLE_LOW_9_QMARK:
451
			return 150;
452
		case UNICODE_EN_DASH:
453
			return 151;
454
		case UNICODE_EM_DASH:
455
			return 152;
456
		case UNICODE_MINUS_SIGN:
457
			return 153;
458
		case UNICODE_CAPITAL_LIGATURE_OE:
459
			return 154;
460
		case UNICODE_SMALL_LIGATURE_OE:
461
			return 155;
462
		case UNICODE_DAGGER:
463
			return 156;
464
		case UNICODE_DOUBLE_DAGGER:
465
			return 157;
466
		case UNICODE_SMALL_LIGATURE_FI:
467
			return 158;
468
		case UNICODE_SMALL_LIGATURE_FL:
469
			return 159;
470
		default:
471
			break;
472
		}
473
	}
474
 
475
	if (eConversionType == conversion_pdf) {
476
		if (eEncoding == encoding_latin_1) {
477
			switch (usChar) {
478
			case UNICODE_EURO_SIGN:
479
				return 128;
480
			default:
481
				break;
482
			}
483
		} else if (eEncoding == encoding_latin_2) {
484
			switch (usChar) {
485
			case UNICODE_CAPITAL_D_WITH_STROKE:
486
			case UNICODE_SMALL_D_WITH_STROKE:
487
				return 0x3f;
488
			default:
489
				break;
490
			}
491
		}
492
	}
493
 
494
	if (usChar < 0x80) {
495
		/* US ASCII */
496
		if (usChar < 0x20 || usChar == 0x7f) {
497
			/* Ignore control characters */
498
			DBG_HEX(usChar);
499
			DBG_FIXME();
500
			return IGNORE_CHARACTER;
501
		}
502
		return (ULONG)usChar;
503
	}
504
 
505
	if (eEncoding == encoding_utf_8) {
506
		/* No need to convert Unicode characters */
507
		return (ULONG)usChar;
508
	}
509
 
510
	/* Unicode to local representation */
511
	pTmp = pGetCharTableRecord(usChar);
512
	if (pTmp != NULL) {
513
		DBG_HEX_C(usChar >= 0x7f && usChar <= 0x9f, usChar);
514
		return (ULONG)pTmp->ucLocal;
515
	}
516
 
517
	/* Fancy characters to simple US ASCII */
518
	switch (usChar) {
519
	case UNICODE_SMALL_F_HOOK:
520
		return (ULONG)'f';
521
	case UNICODE_GREEK_CAPITAL_CHI:
522
		return (ULONG)'X';
523
	case UNICODE_GREEK_SMALL_UPSILON:
524
		return (ULONG)'v';
525
	case UNICODE_MODIFIER_CIRCUMFLEX:
526
	case UNICODE_UPWARDS_ARROW:
527
		return (ULONG)'^';
528
	case UNICODE_SMALL_TILDE:
529
	case UNICODE_TILDE_OPERATOR:
530
		return (ULONG)'~';
531
	case UNICODE_EN_QUAD:
532
	case UNICODE_EM_QUAD:
533
	case UNICODE_EN_SPACE:
534
	case UNICODE_EM_SPACE:
535
	case UNICODE_THREE_PER_EM_SPACE:
536
	case UNICODE_FOUR_PER_EM_SPACE:
537
	case UNICODE_SIX_PER_EM_SPACE:
538
	case UNICODE_FIGURE_SPACE:
539
	case UNICODE_PUNCTUATION_SPACE:
540
	case UNICODE_THIN_SPACE:
541
	case UNICODE_NARROW_NO_BREAK_SPACE:
542
	case UNICODE_LIGHT_SHADE:
543
	case UNICODE_MEDIUM_SHADE:
544
	case UNICODE_DARK_SHADE:
545
		return (ULONG)' ';
546
	case UNICODE_LEFT_DOUBLE_QMARK:
547
	case UNICODE_RIGHT_DOUBLE_QMARK:
548
	case UNICODE_DOUBLE_LOW_9_QMARK:
549
	case UNICODE_DOUBLE_HIGH_REV_9_QMARK:
550
	case UNICODE_DOUBLE_PRIME:
551
		return (ULONG)'"';
552
	case UNICODE_LEFT_SINGLE_QMARK:
553
	case UNICODE_RIGHT_SINGLE_QMARK:
554
	case UNICODE_SINGLE_LOW_9_QMARK:
555
	case UNICODE_SINGLE_HIGH_REV_9_QMARK:
556
	case UNICODE_PRIME:
557
		return (ULONG)'\'';
558
	case UNICODE_HYPHEN:
559
	case UNICODE_NON_BREAKING_HYPHEN:
560
	case UNICODE_FIGURE_DASH:
561
	case UNICODE_EN_DASH:
562
	case UNICODE_EM_DASH:
563
	case UNICODE_HORIZONTAL_BAR:
564
	case UNICODE_MINUS_SIGN:
565
	case UNICODE_BD_LIGHT_HORIZONTAL:
566
	case UNICODE_BD_DOUBLE_HORIZONTAL:
567
		return (ULONG)'-';
568
	case UNICODE_DOUBLE_VERTICAL_LINE:
569
	case UNICODE_BD_LIGHT_VERTICAL:
570
	case UNICODE_BD_DOUBLE_VERTICAL:
571
		return (ULONG)'|';
572
	case UNICODE_DOUBLE_LOW_LINE:
573
		return (ULONG)'_';
574
	case UNICODE_DAGGER:
575
		return (ULONG)'+';
576
	case UNICODE_DOUBLE_DAGGER:
577
		return (ULONG)'#';
578
	case UNICODE_BULLET:
579
	case UNICODE_BULLET_OPERATOR:
580
	case UNICODE_BLACK_CLUB_SUIT:
581
		return (ULONG)ucGetBulletCharacter(eConversionType, eEncoding);
582
	case UNICODE_ONE_DOT_LEADER:
583
	case UNICODE_TWO_DOT_LEADER:
584
		return (ULONG)'.';
585
	case UNICODE_ELLIPSIS:
586
#if defined(__riscos)
587
		return (ULONG)OUR_ELLIPSIS;
588
#else
589
		if (ulFileOffset == 0) {
590
			return (ULONG)OUR_ELLIPSIS;
591
		}
592
		return UNICODE_ELLIPSIS;
593
#endif /* __riscos */
594
	case UNICODE_DOUBLE_LEFT_ANGLE_QMARK:
595
	case UNICODE_TRIANGULAR_BULLET:
596
	case UNICODE_SINGLE_LEFT_ANGLE_QMARK:
597
	case UNICODE_LEFTWARDS_ARROW:
598
		return (ULONG)'<';
599
	case UNICODE_DOUBLE_RIGHT_ANGLE_QMARK:
600
	case UNICODE_SINGLE_RIGHT_ANGLE_QMARK:
601
	case UNICODE_RIGHTWARDS_ARROW:
602
		return (ULONG)'>';
603
	case UNICODE_UNDERTIE:
604
		return (ULONG)'-';
605
	case UNICODE_N_ARY_SUMMATION:
606
		return (ULONG)'S';
607
	case UNICODE_EURO_SIGN:
608
		return (ULONG)'E';
609
	case UNICODE_CIRCLE:
610
	case UNICODE_SQUARE:
611
		return (ULONG)'O';
612
	case UNICODE_DIAMOND:
613
		return (ULONG)OUR_DIAMOND;
614
	case UNICODE_NUMERO_SIGN:
615
		return (ULONG)'N';
616
	case UNICODE_KELVIN_SIGN:
617
		return (ULONG)'K';
618
	case UNICODE_DOWNWARDS_ARROW:
619
		return (ULONG)'v';
620
	case UNICODE_FRACTION_SLASH:
621
	case UNICODE_DIVISION_SLASH:
622
		return (ULONG)'/';
623
	case UNICODE_ASTERISK_OPERATOR:
624
		return (ULONG)'*';
625
	case UNICODE_RATIO:
626
		return (ULONG)':';
627
	case UNICODE_BD_LIGHT_DOWN_RIGHT:
628
	case UNICODE_BD_LIGHT_DOWN_AND_LEFT:
629
	case UNICODE_BD_LIGHT_UP_AND_RIGHT:
630
	case UNICODE_BD_LIGHT_UP_AND_LEFT:
631
	case UNICODE_BD_LIGHT_VERTICAL_AND_RIGHT:
632
	case UNICODE_BD_LIGHT_VERTICAL_AND_LEFT:
633
	case UNICODE_BD_LIGHT_DOWN_AND_HORIZONTAL:
634
	case UNICODE_BD_LIGHT_UP_AND_HORIZONTAL:
635
	case UNICODE_BD_LIGHT_VERTICAL_AND_HORIZONTAL:
636
	case UNICODE_BD_DOUBLE_DOWN_AND_RIGHT:
637
	case UNICODE_BD_DOUBLE_DOWN_AND_LEFT:
638
	case UNICODE_BD_DOUBLE_UP_AND_RIGHT:
639
	case UNICODE_BD_DOUBLE_UP_AND_LEFT:
640
	case UNICODE_BD_DOUBLE_VERTICAL_AND_RIGHT:
641
	case UNICODE_BD_DOUBLE_VERTICAL_AND_LEFT:
642
	case UNICODE_BD_DOUBLE_DOWN_AND_HORIZONTAL:
643
	case UNICODE_BD_DOUBLE_UP_AND_HORIZONTAL:
644
	case UNICODE_BD_DOUBLE_VERTICAL_AND_HORIZONTAL:
645
	case UNICODE_BLACK_SQUARE:
646
		return (ULONG)'+';
647
	case UNICODE_HAIR_SPACE:
648
	case UNICODE_ZERO_WIDTH_SPACE:
649
	case UNICODE_ZERO_WIDTH_NON_JOINER:
650
	case UNICODE_ZERO_WIDTH_JOINER:
651
	case UNICODE_LEFT_TO_RIGHT_MARK:
652
	case UNICODE_RIGHT_TO_LEFT_MARK:
653
	case UNICODE_LEFT_TO_RIGHT_EMBEDDING:
654
	case UNICODE_RIGHT_TO_LEFT_EMBEDDING:
655
	case UNICODE_POP_DIRECTIONAL_FORMATTING:
656
	case UNICODE_LEFT_TO_RIGHT_OVERRIDE:
657
	case UNICODE_RIGHT_TO_LEFT_OVERRIDE:
658
	case UNICODE_ZERO_WIDTH_NO_BREAK_SPACE:
659
		return IGNORE_CHARACTER;
660
	default:
661
		break;
662
	}
663
 
664
	if (usChar == UNICODE_TRADEMARK_SIGN) {
665
		/*
666
		 * No local representation, it doesn't look like anything in
667
		 * US-ASCII and a question mark does more harm than good.
668
		 */
669
		return IGNORE_CHARACTER;
670
	}
671
 
672
	if (usChar >= 0xa0 && usChar <= 0xff) {
673
		/* Before Word 97, Word did't use Unicode */
674
		return (ULONG)usChar;
675
	}
676
 
677
	DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, ulFileOffset);
678
	DBG_HEX_C(usChar < 0x3000 || usChar >= 0xd800, usChar);
679
	DBG_MSG_C(usChar >= 0xe000 && usChar < 0xf900, "Private Use Area");
680
 
681
	/* Untranslated Unicode character */
682
	return 0x3f;
683
} /* end of ulTranslateCharacters */
684
 
685
/*
686
 * ulToUpper - convert letter to upper case
687
 *
688
 * This function converts a letter to upper case. Unlike toupper(3) this
689
 * function is independent from the settings of locale. This comes in handy
690
 * for people who have to read Word documents in more than one language or
691
 * contain more than one language.
692
 *
693
 * returns the converted letter, or ulChar if the conversion was not possible.
694
 */
695
ULONG
696
ulToUpper(ULONG ulChar)
697
{
698
	if (ulChar < 0x80) {
699
		/* US ASCII: use standard function */
700
		return (ULONG)toupper((int)ulChar);
701
	}
702
	if (ulChar >= 0xe0 && ulChar <= 0xfe && ulChar != 0xf7) {
703
		/*
704
		 * Lower case accented characters
705
		 * 0xf7 is Division sign; 0xd7 is Multiplication sign
706
		 * 0xff is y with diaeresis; 0xdf is Sharp s
707
		 */
708
		return ulChar & ~0x20;
709
	}
710
#if defined(__STDC_ISO_10646__)
711
	/*
712
	 * If this is ISO C99 and all locales have wchar_t = ISO 10646
713
	 * (e.g., glibc 2.2 or newer), then use standard function
714
	 */
715
	if (ulChar > 0xff) {
716
		return (ULONG)towupper((wint_t)ulChar);
717
	}
718
#endif /* __STDC_ISO_10646__ */
719
	return ulChar;
720
} /* end of ulToUpper */