WebSVN – planix.SVN – Blame – /os/trunk/sys/src/cmd/aux/antiword/utf8.c

Rev	Author	Line No.	Line
2	-	1	`/*`
		2	`* utf8.c`
		3	`* Copyright (C) 2001-2004 A.J. van Os; Released under GPL`
		4	`*`
		5	`*====================================================================`
		6	`* This part of the software is based on:`
		7	`* An implementation of wcwidth() as defined in`
		8	`* "The Single UNIX Specification, Version 2, The Open Group, 1997"`
		9	`* <http://www.UNIX-systems.org/online.html>`
		10	`* Markus Kuhn -- 2001-01-12 -- public domain`
		11	`*====================================================================`
		12	`* The credit should go to him, but all the bugs are mine.`
		13	`*/`
		14
		15	`#include <stdlib.h>`
		16	`#include <string.h>`
		17	`#include "antiword.h"`
		18
		19	`struct interval {`
		20	`USHORT first;`
		21	`USHORT last;`
		22	`};`
		23	`/* Sorted list of non-overlapping intervals of non-spacing characters */`
		24	`static const struct interval combining[] = {`
		25	`{ 0x0300, 0x034E }, { 0x0360, 0x0362 }, { 0x0483, 0x0486 },`
		26	`{ 0x0488, 0x0489 }, { 0x0591, 0x05A1 }, { 0x05A3, 0x05B9 },`
		27	`{ 0x05BB, 0x05BD }, { 0x05BF, 0x05BF }, { 0x05C1, 0x05C2 },`
		28	`{ 0x05C4, 0x05C4 }, { 0x064B, 0x0655 }, { 0x0670, 0x0670 },`
		29	`{ 0x06D6, 0x06E4 }, { 0x06E7, 0x06E8 }, { 0x06EA, 0x06ED },`
		30	`{ 0x070F, 0x070F }, { 0x0711, 0x0711 }, { 0x0730, 0x074A },`
		31	`{ 0x07A6, 0x07B0 }, { 0x0901, 0x0902 }, { 0x093C, 0x093C },`
		32	`{ 0x0941, 0x0948 }, { 0x094D, 0x094D }, { 0x0951, 0x0954 },`
		33	`{ 0x0962, 0x0963 }, { 0x0981, 0x0981 }, { 0x09BC, 0x09BC },`
		34	`{ 0x09C1, 0x09C4 }, { 0x09CD, 0x09CD }, { 0x09E2, 0x09E3 },`
		35	`{ 0x0A02, 0x0A02 }, { 0x0A3C, 0x0A3C }, { 0x0A41, 0x0A42 },`
		36	`{ 0x0A47, 0x0A48 }, { 0x0A4B, 0x0A4D }, { 0x0A70, 0x0A71 },`
		37	`{ 0x0A81, 0x0A82 }, { 0x0ABC, 0x0ABC }, { 0x0AC1, 0x0AC5 },`
		38	`{ 0x0AC7, 0x0AC8 }, { 0x0ACD, 0x0ACD }, { 0x0B01, 0x0B01 },`
		39	`{ 0x0B3C, 0x0B3C }, { 0x0B3F, 0x0B3F }, { 0x0B41, 0x0B43 },`
		40	`{ 0x0B4D, 0x0B4D }, { 0x0B56, 0x0B56 }, { 0x0B82, 0x0B82 },`
		41	`{ 0x0BC0, 0x0BC0 }, { 0x0BCD, 0x0BCD }, { 0x0C3E, 0x0C40 },`
		42	`{ 0x0C46, 0x0C48 }, { 0x0C4A, 0x0C4D }, { 0x0C55, 0x0C56 },`
		43	`{ 0x0CBF, 0x0CBF }, { 0x0CC6, 0x0CC6 }, { 0x0CCC, 0x0CCD },`
		44	`{ 0x0D41, 0x0D43 }, { 0x0D4D, 0x0D4D }, { 0x0DCA, 0x0DCA },`
		45	`{ 0x0DD2, 0x0DD4 }, { 0x0DD6, 0x0DD6 }, { 0x0E31, 0x0E31 },`
		46	`{ 0x0E34, 0x0E3A }, { 0x0E47, 0x0E4E }, { 0x0EB1, 0x0EB1 },`
		47	`{ 0x0EB4, 0x0EB9 }, { 0x0EBB, 0x0EBC }, { 0x0EC8, 0x0ECD },`
		48	`{ 0x0F18, 0x0F19 }, { 0x0F35, 0x0F35 }, { 0x0F37, 0x0F37 },`
		49	`{ 0x0F39, 0x0F39 }, { 0x0F71, 0x0F7E }, { 0x0F80, 0x0F84 },`
		50	`{ 0x0F86, 0x0F87 }, { 0x0F90, 0x0F97 }, { 0x0F99, 0x0FBC },`
		51	`{ 0x0FC6, 0x0FC6 }, { 0x102D, 0x1030 }, { 0x1032, 0x1032 },`
		52	`{ 0x1036, 0x1037 }, { 0x1039, 0x1039 }, { 0x1058, 0x1059 },`
		53	`{ 0x1160, 0x11FF }, { 0x17B7, 0x17BD }, { 0x17C6, 0x17C6 },`
		54	`{ 0x17C9, 0x17D3 }, { 0x180B, 0x180E }, { 0x18A9, 0x18A9 },`
		55	`{ 0x200B, 0x200F }, { 0x202A, 0x202E }, { 0x206A, 0x206F },`
		56	`{ 0x20D0, 0x20E3 }, { 0x302A, 0x302F }, { 0x3099, 0x309A },`
		57	`{ 0xFB1E, 0xFB1E }, { 0xFE20, 0xFE23 }, { 0xFEFF, 0xFEFF },`
		58	`{ 0xFFF9, 0xFFFB }`
		59	`};`
		60
		61	`/* Auxiliary function for binary search in interval table */`
		62	`static BOOL`
		63	`bIsZeroWidthChar(ULONG ucs)`
		64	`{`
		65	`int low = 0;`
		66	`int high = elementsof(combining) - 1;`
		67	`int mid;`
		68
		69	`if (ucs < (ULONG)combining[low].first \|\|`
		70	`ucs > (ULONG)combining[high].last) {`
		71	`return FALSE;`
		72	`}`
		73
		74	`while (high >= low) {`
		75	`mid = (low + high) / 2;`
		76	`if (ucs > (ULONG)combining[mid].last) {`
		77	`low = mid + 1;`
		78	`} else if (ucs < (ULONG)combining[mid].first) {`
		79	`high = mid - 1;`
		80	`} else {`
		81	`return TRUE;`
		82	`}`
		83	`}`
		84	`return FALSE;`
		85	`} /* end of bIsZeroWidthChar */`
		86
		87	`/* The following functions define the column width of an ISO 10646`
		88	`* character as follows:`
		89	`*`
		90	`* - The null character (U+0000) has a column width of 0.`
		91	`*`
		92	`* - Other C0/C1 control characters and DEL will lead to a return`
		93	`* value of -1.`
		94	`*`
		95	`* - Non-spacing and enclosing combining characters (general`
		96	`* category code Mn or Me in the Unicode database) have a`
		97	`* column width of 0.`
		98	`*`
		99	`* - Other format characters (general category code Cf in the Unicode`
		100	`* database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.`
		101	`*`
		102	`* - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)`
		103	`* have a column width of 0.`
		104	`*`
		105	`* - Spacing characters in the East Asian Wide (W) or East Asian`
		106	`* FullWidth (F) category as defined in Unicode Technical`
		107	`* Report #11 have a column width of 2.`
		108	`*`
		109	`* - All remaining characters (including all printable`
		110	`* ISO 8859-1 and WGL4 characters, Unicode control characters,`
		111	`* etc.) have a column width of 1.`
		112	`*`
		113	`* This implementation assumes that all characters are encoded`
		114	`* in ISO 10646.`
		115	`*`
		116	`* This function is not named wcwidth() to prevent name clashes`
		117	`*/`
		118	`static int`
		119	`iWcWidth(ULONG ucs)`
		120	`{`
		121	`/* Test for 8-bit control characters */`
		122	`if (ucs == 0) {`
		123	`return 0;`
		124	`}`
		125	`if (ucs < 0x20 \|\| (ucs >= 0x7f && ucs < 0xa0)) {`
		126	`NO_DBG_HEX(ucs);`
		127	`return -1;`
		128	`}`
		129
		130	`/* Binary search in table of non-spacing characters */`
		131	`if (bIsZeroWidthChar(ucs)) {`
		132	`return 0;`
		133	`}`
		134
		135	`/* Ucs is not a combining or C0/C1 control character */`
		136
		137	`return 1 +`
		138	`(ucs >= 0x1100 &&`
		139	`(ucs <= 0x115f \|\| /* Hangul Jamo init. consonants */`
		140	`(ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&`
		141	`ucs != 0x303f) \|\| /* CJK ... Yi */`
		142	`(ucs >= 0xac00 && ucs <= 0xd7a3) \|\| /* Hangul Syllables */`
		143	`(ucs >= 0xf900 && ucs <= 0xfaff) \|\| /* CJK Compatibility Ideographs */`
		144	`(ucs >= 0xfe30 && ucs <= 0xfe6f) \|\| /* CJK Compatibility Forms */`
		145	`(ucs >= 0xff00 && ucs <= 0xff5f) \|\| /* Fullwidth Forms */`
		146	`(ucs >= 0xffe0 && ucs <= 0xffe6) \|\|`
		147	`(ucs >= 0x20000 && ucs <= 0x2ffff)));`
		148	`} /* end of iWcWidth */`
		149
		150	`/*`
		151	`* utf8_to_ucs - convert from UTF-8 to UCS`
		152	`*`
		153	`* Returns the UCS character,`
		154	`* Fills in the number of bytes in the UTF-8 character`
		155	`*/`
		156	`static ULONG`
		157	`utf8_to_ucs(const char p, int iStrLen, int piUtfLen)`
		158	`{`
		159	`ULONG ulUcs;`
		160	`int iIndex, iCharLen;`
		161
		162	`fail(p == NULL \|\| piUtfLen == NULL);`
		163	`fail(iStrLen < 1);`
		164
		165	`ulUcs = (ULONG)(UCHAR)p[0];`
		166
		167	`if (ulUcs < 0x80) {`
		168	`*piUtfLen = 1;`
		169	`return ulUcs;`
		170	`}`
		171
		172	`if (ulUcs < 0xe0){`
		173	`iCharLen = 2;`
		174	`ulUcs &= 0x1f;`
		175	`} else if (ulUcs < 0xf0){`
		176	`iCharLen = 3;`
		177	`ulUcs &= 0x0f;`
		178	`} else if (ulUcs < 0xf8){`
		179	`iCharLen = 4;`
		180	`ulUcs &= 0x07;`
		181	`} else if (ulUcs < 0xfc){`
		182	`iCharLen = 5;`
		183	`ulUcs &= 0x03;`
		184	`} else {`
		185	`iCharLen = 6;`
		186	`ulUcs &= 0x01;`
		187	`}`
		188	`for (iIndex = 1; iIndex < iCharLen; iIndex++) {`
		189	`ulUcs <<= 6;`
		190	`if (iIndex < iStrLen) {`
		191	`ulUcs \|= (ULONG)(UCHAR)p[iIndex] & 0x3f;`
		192	`}`
		193	`}`
		194	`*piUtfLen = iCharLen;`
		195	`return ulUcs;`
		196	`} /* end of utf8_to_ucs */`
		197
		198	`/*`
		199	`* utf8_strwidth - compute the string width of an UTF-8 string`
		200	`*`
		201	`* Returns the string width in columns`
		202	`*/`
		203	`long`
		204	`utf8_strwidth(const char *pcString, size_t tNumchars)`
		205	`{`
		206	`ULONG ulUcs;`
		207	`long lTotal;`
		208	`int iToGo, iWidth, iUtflen;`
		209
		210	`fail(pcString == NULL \|\| tNumchars > (size_t)INT_MAX);`
		211
		212	`lTotal = 0;`
		213	`iToGo = (int)tNumchars;`
		214
		215	`while (iToGo > 0 && *pcString != '\0') {`
		216	`ulUcs = utf8_to_ucs(pcString, iToGo, &iUtflen);`
		217	`iWidth = iWcWidth(ulUcs);`
		218	`if (iWidth > 0) {`
		219	`lTotal += iWidth;`
		220	`}`
		221	`pcString += iUtflen;`
		222	`iToGo -= iUtflen;`
		223	`}`
		224	`NO_DBG_DEC(lTotal);`
		225	`return lTotal;`
		226	`} /* end of utf8_strwidth */`
		227
		228	`/*`
		229	`* utf8_chrlength - get the number of bytes in an UTF-8 character`
		230	`*`
		231	`* Returns the number of bytes`
		232	`*/`
		233	`int`
		234	`utf8_chrlength(const char *p)`
		235	`{`
		236	`int iUtflen;`
		237
		238	`fail(p == NULL);`
		239
		240	`iUtflen = -1; /* Just to make sure */`
		241	`(void)utf8_to_ucs(p, INT_MAX, &iUtflen);`
		242	`NO_DBG_DEC(iUtflen);`
		243	`return iUtflen;`
		244	`} /* end of utf8_chrlength */`
		245
		246	`/*`
		247	`* is_locale_utf8 - return TRUE if the locale is UTF-8`
		248	`*/`
		249	`BOOL`
		250	`is_locale_utf8(void)`
		251	`{`
		252	`char szCodeset[20];`
		253
		254	`szCodeset[0] = '\0';`
		255	`if (!bGetNormalizedCodeset(szCodeset, sizeof(szCodeset), NULL)) {`
		256	`return FALSE;`
		257	`}`
		258	`DBG_MSG(szCodeset);`
		259	`return STREQ(szCodeset, "utf8");`
		260	`} /* end of is_locale_utf8 */`

Subversion Repositories planix.SVN

(root)/os/trunk/sys/src/cmd/aux/antiword/utf8.c – Rev 2