Annotation of OpenXM_contrib2/windows/mcpp/mbchar.c, Revision 1.1
1.1 ! ohara 1: /* $OpenXM$ */
! 2: /*-
! 3: * Copyright (c) 1998, 2002-2008 Kiyoshi Matsui <kmatsui@t3.rim.or.jp>
! 4: * All rights reserved.
! 5: *
! 6: * Some parts of this code are derived from the public domain software
! 7: * DECUS cpp (1984,1985) written by Martin Minow.
! 8: *
! 9: * Redistribution and use in source and binary forms, with or without
! 10: * modification, are permitted provided that the following conditions
! 11: * are met:
! 12: * 1. Redistributions of source code must retain the above copyright
! 13: * notice, this list of conditions and the following disclaimer.
! 14: * 2. Redistributions in binary form must reproduce the above copyright
! 15: * notice, this list of conditions and the following disclaimer in the
! 16: * documentation and/or other materials provided with the distribution.
! 17: *
! 18: * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
! 19: * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
! 20: * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
! 21: * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE
! 22: * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
! 23: * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
! 24: * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
! 25: * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
! 26: * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
! 27: * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
! 28: * SUCH DAMAGE.
! 29: */
! 30:
! 31: /*
! 32: * M B C H A R . C
! 33: * C h a r a c t e r h a n d l i n g R o u t i n e s
! 34: *
! 35: * Character handling and multi-byte character handling routines are
! 36: * placed here.
! 37: */
! 38:
! 39: #if PREPROCESSED
! 40: #include "mcpp.H"
! 41: #else
! 42: #include "system.H"
! 43: #include "internal.H"
! 44: #endif
! 45:
! 46: /*
! 47: * Tables of character types and multi-byte character types.
! 48: *
! 49: * Some of these character attributes will be overwritten by
! 50: * execution time option '-@post' or '-@old'.
! 51: * Warning on erroneous sequence will be issued from the caller routines:
! 52: * scan_quote(), scan_id() or scan_number().
! 53: */
! 54:
/*
 * Flag carried by every byte outside the ASCII range.
 * Non-ASCII characters are always checked by mb_read().
 */
#define NA      0x4000

/* Horizontal white space: ' ', '\t' and TOK_SEP */
#define HSPA    (SPA | HSP)

/* The character type table in effect: one of the type_*[] tables below.   */
short *     char_type;

/* Byte-position flags of the EUC-style 2-byte encodings   */
#define EJ1     0x100               /* EUC_JP : 1st byte    */
#define EJ2     0x200               /* EUC_JP : 2nd byte    */
#define GB1     0x400               /* GB2312 : 1st byte    */
#define GB2     0x800               /* GB2312 : 2nd byte    */
#define KS1     0x1000              /* KSC5601: 1st byte    */
#define KS2     0x2000              /* KSC5601: 2nd byte    */

/* Combinations of the above used to fill type_euc[]    */
#define EJ12    (EJ1 | EJ2)         /* 1st or 2nd byte of EUC_JP    */
#define GB12    (GB1 | GB2)         /* 1st or 2nd byte of GB2312    */
#define KS12    (KS1 | KS2)         /* 1st or 2nd byte of KSC5601   */
#define EJ1N    (NA | EJ1)          /* Non-ASCII, 1st byte of EUC_JP only   */
#define EU12N   (NA | EJ12 | GB12 | KS12)
    /* 1st or 2nd byte of EUC_JP, GB2312 or KSC5601, or any other non-ASCII */
! 76:
! 77: static short type_euc[ UCHARMAX + 1] = {
! 78: /*
! 79: * For EUC_JP, GB2312, KSC5601 or other similar multi-byte char encodings.
! 80: */
! 81:
! 82: /* Character type codes */
! 83: /* 0, 1, 2, 3, 4, 5, 6, 7, */
! 84: /* 8, 9, A, B, C, D, E, F, Hex */
! 85:
! 86: 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
! 87: 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
! 88: 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
! 89: /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
! 90: 000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
! 91: HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
! 92: PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
! 93: DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
! 94: DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
! 95:
! 96: 000, LET, LET, LET, LET, LET, LET, LET, /* 40 @ABCDEFG */
! 97: LET, LET, LET, LET, LET, LET, LET, LET, /* 48 HIJKLMNO */
! 98: LET, LET, LET, LET, LET, LET, LET, LET, /* 50 PQRSTUVW */
! 99: LET, LET, LET, PUNC, 000, PUNC, PUNC, LET, /* 58 XYZ[\]^_ */
! 100: 000, LET, LET, LET, LET, LET, LET, LET, /* 60 `abcdefg */
! 101: LET, LET, LET, LET, LET, LET, LET, LET, /* 68 hijklmno */
! 102: LET, LET, LET, LET, LET, LET, LET, LET, /* 70 pqrstuvw */
! 103: LET, LET, LET, PUNC, PUNC, PUNC, PUNC, 000, /* 78 xyz{|}~ */
! 104:
! 105: NA, NA, NA, NA, NA, NA, NA, NA, /* 80 .. 87 */
! 106: NA, NA, NA, NA, NA, NA, EJ1N, NA, /* 88 .. 8F */
! 107: NA, NA, NA, NA, NA, NA, NA, NA, /* 90 .. 97 */
! 108: NA, NA, NA, NA, NA, NA, NA, NA, /* 98 .. 9F */
! 109: NA, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* A0 .. A7 */
! 110: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* A8 .. AF */
! 111: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* B0 .. B7 */
! 112: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* B8 .. BF */
! 113: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* C0 .. C7 */
! 114: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* C8 .. CF */
! 115: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* D0 .. D7 */
! 116: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* D8 .. DF */
! 117: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* E0 .. E7 */
! 118: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* E8 .. EF */
! 119: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, /* F0 .. F7 */
! 120: EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, NA, /* F8 .. FF */
! 121: };
! 122:
! 123: static short type_bsl[ UCHARMAX + 1] = {
! 124: /*
! 125: * For SJIS, BIGFIVE or other similar encodings which may have '\\' value as
! 126: * the second byte of multi-byte character.
! 127: */
! 128:
! 129: #define SJ1 0x100 /* 1st byte of SJIS */
! 130: #define SJ2 0x200 /* 2nd byte of SJIS */
! 131: #define BF1 0x400 /* 1st byte of BIGFIVE */
! 132: #define BF2 0x800 /* 2nd byte of BIGFIVE */
! 133:
! 134: #define SB2 (SJ2 | BF2)
! 135: #define SJ2N (NA | SJ2)
! 136: #define SB2N (NA | SJ2 | BF2)
! 137: #define SJ12N (NA | SJ1 | SJ2)
! 138: #define BF12N (NA | BF1 | BF2)
! 139: #define SB12N (NA | SJ1 | SJ2 | BF1 | BF2)
! 140: #define S2B12N (NA | SJ2 | BF1 | BF2)
! 141:
! 142: #define LSB2 (LET | SB2)
! 143: #define PSB2 (PUNC| SB2)
! 144:
! 145: /* Character type codes */
! 146: /* 0, 1, 2, 3, 4, 5, 6, 7, */
! 147: /* 8, 9, A, B, C, D, E, F, Hex */
! 148:
! 149: 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
! 150: 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
! 151: 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
! 152: /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
! 153: 000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
! 154: HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
! 155: PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
! 156: DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
! 157: DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
! 158:
! 159: SB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 40 @ABCDEFG */
! 160: LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 48 HIJKLMNO */
! 161: LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 50 PQRSTUVW */
! 162: LSB2, LSB2, LSB2, PSB2, SB2, PSB2, PSB2, LSB2, /* 58 XYZ[\]^_ */
! 163: SB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 60 `abcdefg */
! 164: LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 68 hijklmno */
! 165: LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, LSB2, /* 70 pqrstuvw */
! 166: LSB2, LSB2, LSB2, PSB2, PSB2, PSB2, PSB2, 000, /* 78 xyz{|}~ */
! 167:
! 168: SB2N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 80 .. 87 */
! 169: SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 88 .. 8F */
! 170: SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 90 .. 97 */
! 171: SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, /* 98 .. 9F */
! 172: SJ2N, S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* A0 .. A7 */
! 173: S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* A8 .. AF */
! 174: S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* B0 .. B7 */
! 175: S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* B8 .. BF */
! 176: S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* C0 .. C7 */
! 177: S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* C8 .. CF */
! 178: S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* D0 .. D7 */
! 179: S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /* D8 .. DF */
! 180: SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* E0 .. E7 */
! 181: SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* E8 .. EF */
! 182: SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, /* F0 .. F7 */
! 183: SB12N, SB12N, SB12N, SB12N, SB12N, BF12N, BF12N, NA, /* F8 .. FF */
! 184: };
! 185:
! 186: /*
! 187: * For ISO2022_JP multi-byte character encoding.
! 188: */
! 189:
! 190: #define IS1 0x100 /* 1st byte of shift-sequence */
! 191: #define IS2 0x200 /* 2nd byte of shift-sequence */
! 192: #define IS3 0x400 /* 3rd byte of shift-sequence */
! 193: #define IS4 0x800 /* 4th byte of shift-sequence */
! 194: #define IJP 0x1000 /* 1st or 2nd byte of ISO-2022-JP (ISO-2022-JP1) */
! 195:
! 196: #define PIJP (PUNC | IJP)
! 197: #define QIJP (QUO | IJP)
! 198: #define DTJP (DOT | IJP)
! 199: #define DGJP (DIG | IJP)
! 200: #define LIJP (LET | IJP)
! 201:
! 202: #define JPS2 (IJP | IS2)
! 203: #define PJPS23 (PIJP | IS2 | IS3)
! 204: #define LJPS3 (LIJP | IS3)
! 205: #define LJPS4 (LIJP | IS4)
! 206:
! 207: static short type_iso2022_jp[ UCHARMAX + 1] = {
! 208:
! 209: /* Character type codes */
! 210: /* 0, 1, 2, 3, 4, 5, 6, 7, */
! 211: /* 8, 9, A, B, C, D, E, F, Hex */
! 212:
! 213: 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
! 214: 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
! 215: 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
! 216: /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
! 217: 000, LET, LET, IS1, 000, 000, 000, HSPA, /* 18 */
! 218: HSPA, PIJP, QIJP, PIJP, JPS2, PIJP, PIJP, QIJP, /* 20 !"#$%&' */
! 219: PJPS23,PIJP, PIJP, PIJP, PIJP, PIJP, DTJP, PIJP, /* 28 ()*+,-./ */
! 220: DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, DGJP, /* 30 01234567 */
! 221: DGJP, DGJP, PIJP, PIJP, PIJP, PIJP, PIJP, PIJP, /* 38 89:;<=>? */
! 222:
! 223: IJP, LIJP, LJPS3, LIJP, LJPS4, LIJP, LIJP, LIJP, /* 40 @ABCDEFG */
! 224: LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 48 HIJKLMNO */
! 225: LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 50 PQRSTUVW */
! 226: LIJP, LIJP, LIJP, PIJP, IJP, PIJP, PIJP, LIJP, /* 58 XYZ[\]^_ */
! 227: IJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 60 `abcdefg */
! 228: LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 68 hijklmno */
! 229: LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, LIJP, /* 70 pqrstuvw */
! 230: LIJP, LIJP, LIJP, PIJP, PIJP, PIJP, PIJP, 000, /* 78 xyz{|}~ */
! 231:
! 232: NA, NA, NA, NA, NA, NA, NA, NA, /* 80 .. 87 */
! 233: NA, NA, NA, NA, NA, NA, NA, NA, /* 88 .. 8F */
! 234: NA, NA, NA, NA, NA, NA, NA, NA, /* 90 .. 97 */
! 235: NA, NA, NA, NA, NA, NA, NA, NA, /* 98 .. 9F */
! 236: NA, NA, NA, NA, NA, NA, NA, NA, /* A0 .. A7 */
! 237: NA, NA, NA, NA, NA, NA, NA, NA, /* A8 .. AF */
! 238: NA, NA, NA, NA, NA, NA, NA, NA, /* B0 .. B7 */
! 239: NA, NA, NA, NA, NA, NA, NA, NA, /* B8 .. BF */
! 240: NA, NA, NA, NA, NA, NA, NA, NA, /* C0 .. C7 */
! 241: NA, NA, NA, NA, NA, NA, NA, NA, /* C8 .. CF */
! 242: NA, NA, NA, NA, NA, NA, NA, NA, /* D0 .. D7 */
! 243: NA, NA, NA, NA, NA, NA, NA, NA, /* D8 .. DF */
! 244: NA, NA, NA, NA, NA, NA, NA, NA, /* E0 .. E7 */
! 245: NA, NA, NA, NA, NA, NA, NA, NA, /* E8 .. EF */
! 246: NA, NA, NA, NA, NA, NA, NA, NA, /* F0 .. F7 */
! 247: NA, NA, NA, NA, NA, NA, NA, NA, /* F8 .. FF */
! 248: };
! 249:
! 250: /*
! 251: * For UTF8 multi-byte character encoding.
! 252: */
! 253:
! 254: #define U2_1 0x100 /* 1st byte of 2-byte encoding of UTF8 */
! 255: #define U3_1 0x200 /* 1st byte of 3-byte encoding of UTF8 */
! 256: #define U4_1 0x400 /* 1st byte of 4-byte encoding of UTF8 */
! 257: #define UCONT 0x800 /* Continuation of a 2, 3, or 4 byte UTF8 sequence */
! 258: #define U2_1N (NA | U2_1)
! 259: #define U3_1N (NA | U3_1)
! 260: #define U4_1N (NA | U4_1)
! 261: #define UCONTN (NA | UCONT)
! 262:
! 263: static short type_utf8[ UCHARMAX + 1] = {
! 264:
! 265: /* Character type codes */
! 266: /* 0, 1, 2, 3, 4, 5, 6, 7, */
! 267: /* 8, 9, A, B, C, D, E, F, Hex */
! 268:
! 269: 000, 000, 000, 000, 000, 000, 000, 000, /* 00 */
! 270: 000, HSPA, SPA, SPA, SPA, SPA, 000, 000, /* 08 */
! 271: 000, 000, 000, 000, 000, 000, 000, 000, /* 10 */
! 272: /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts() */
! 273: 000, LET, LET, 000, 000, 000, 000, HSPA, /* 18 */
! 274: HSPA, PUNC, QUO, PUNC, 000, PUNC, PUNC, QUO, /* 20 !"#$%&' */
! 275: PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, DOT, PUNC, /* 28 ()*+,-./ */
! 276: DIG, DIG, DIG, DIG, DIG, DIG, DIG, DIG, /* 30 01234567 */
! 277: DIG, DIG, PUNC, PUNC, PUNC, PUNC, PUNC, PUNC, /* 38 89:;<=>? */
! 278:
! 279: 000, LET, LET, LET, LET, LET, LET, LET, /* 40 @ABCDEFG */
! 280: LET, LET, LET, LET, LET, LET, LET, LET, /* 48 HIJKLMNO */
! 281: LET, LET, LET, LET, LET, LET, LET, LET, /* 50 PQRSTUVW */
! 282: LET, LET, LET, PUNC, 000, PUNC, PUNC, LET, /* 58 XYZ[\]^_ */
! 283: 000, LET, LET, LET, LET, LET, LET, LET, /* 60 `abcdefg */
! 284: LET, LET, LET, LET, LET, LET, LET, LET, /* 68 hijklmno */
! 285: LET, LET, LET, LET, LET, LET, LET, LET, /* 70 pqrstuvw */
! 286: LET, LET, LET, PUNC, PUNC, PUNC, PUNC, 000, /* 78 xyz{|}~ */
! 287:
! 288: UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 80 .. 87 */
! 289: UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 88 .. 8F */
! 290: UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 90 .. 97 */
! 291: UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* 98 .. 9F */
! 292: UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* A0 .. A7 */
! 293: UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* A8 .. AF */
! 294: UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* B0 .. B7 */
! 295: UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /* B8 .. BF */
! 296: NA, NA, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* C0 .. C7 */
! 297: U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* C8 .. CF */
! 298: U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* D0 .. D7 */
! 299: U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, /* D8 .. DF */
! 300: U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, /* E0 .. E7 */
! 301: U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, /* E8 .. EF */
! 302: U4_1N, U4_1N, U4_1N, U4_1N, U4_1N, NA, NA, NA, /* F0 .. F7 */
! 303: NA, NA, NA, NA, NA, NA, NA, NA, /* F8 .. FF */
! 304: };
! 305:
/* Value of 'pragma' for #pragma setlocale (not __setlocale)   */
#define SETLOCALE       2

#define NUM_ENCODING    8           /* Number of rows of encoding_name[][]  */
#define NUM_ALIAS       6           /* Number of names in each row          */

/*
 * Names of encoding recognized.  Table for search_encoding().
 * Within a row, the first two names are the Visual C full name and the
 * Visual C short name, and the other four are miscellaneous aliases.
 * An empty string fills a slot for which there is no name.
 * The row number is what search_encoding() converts to a value of 'mbchar'.
 */
static const char * const   encoding_name[ NUM_ENCODING][ NUM_ALIAS] = {
    /* 0: no multi-byte character   */
    { "english",            "c",    "c",            "en",           "latin",    "iso8859"   },
    /* 1: EUC_JP    */
    { "",                   "",     "eucjp",        "euc",          "ujis",     ""          },
    /* 2: GB2312    */
    { "chinesesimplified",  "chs",  "gb2312",       "cngb",         "euccn",    ""          },
    /* 3: KSC5601   */
    { "korean",             "kor",  "ksc5601",      "ksx1001",      "wansung",  "euckr"     },
    /* 4: SJIS      */
    { "japanese",           "jpn",  "sjis",         "shiftjis",     "mskanji",  ""          },
    /* 5: BIGFIVE   */
    { "chinesetraditional", "cht",  "bigfive",      "big5",         "cnbig5",   "euctw"     },
    /* 6: ISO2022_JP    */
    { "",                   "",     "iso2022jp",    "iso2022jp1",   "jis",      ""          },
    /* 7: UTF8      */
    { "",                   "",     "utf8",         "utf",          "",         ""          },
};
! 332:
/* Type flags of the encoding in effect; both are set by mb_init().    */
static int      mbstart;    /* Flag(s) marking the start of a multi-byte char   */
static int      mb2;        /* Flag marking the 2nd byte of a 2-byte encoding   */

/* Readers of a multi-byte character sequence, one of which becomes mb_read */
static size_t   mb_read_2byte( int c1, char ** in_pp, char ** out_pp);
                /* For 2-byte encodings of mbchar   */
static size_t   mb_read_iso2022_jp( int c1, char ** in_pp, char ** out_pp);
                /* For ISO2022_JP encoding          */
static size_t   mb_read_utf8( int c1, char ** in_pp, char ** out_pp);
                /* For UTF8 mbchar encoding         */

/* Helpers of set_encoding()    */
static const char *     search_encoding( char * norm, int alias);
                /* Search encoding_name[][] table   */
static void     strip_bar( char * string);
                /* Remove '_', '-' or '.' in the string     */
static void     conv_case( char * name, char * lim, int upper);
                /* Convert to upper/lower case      */

#define NAMLEN  20          /* Size of the buffer for a normalized name */
#define UPPER   1           /* To upper     */
#define LOWER   0           /* To lower     */
! 352:
! 353:
! 354: const char * set_encoding(
! 355: char * name, /* Name of encoding specified */
! 356: char * env, /* Name of environment variable */
! 357: int pragma
! 358: /* 2: #pragma setlocale, 1: #pragma __setlocale, 0: not #pragma */
! 359: )
! 360: /*
! 361: * Search the encoding specified and re-initialize mbchar settings.
! 362: */
! 363: {
! 364: const char * unknown_encoding
! 365: = "Unknown encoding: %s%.0ld%.0s"; /* _W1_ */
! 366: const char * too_long
! 367: = "Too long encoding name: %s%.0ld%.0s"; /* _E_ */
! 368: const char * loc = "";
! 369: int alias;
! 370: char norm[ NAMLEN];
! 371: /*
! 372: * Normalized name (removed 'xxxxx.', stripped '_', '-', '.'
! 373: * and lowered.
! 374: */
! 375:
! 376: if (strlen( name) >= NAMLEN) {
! 377: if ((env || pragma) && (warn_level & 1)) {
! 378: cwarn( too_long, name, 0L, NULL);
! 379: } else {
! 380: mcpp_fprintf( ERR, too_long, name);
! 381: mcpp_fputc( '\n', ERR);
! 382: }
! 383: }
! 384: strcpy( norm, name);
! 385: if (norm[ 5] == '.')
! 386: memmove( norm, norm + 5, strlen( norm + 5) + 1);
! 387: /* Remove initial 'xxxxx.' as 'ja_JP.', 'en_US.' or any other */
! 388: conv_case( norm, norm + strlen( norm), LOWER);
! 389: strip_bar( norm);
! 390:
! 391: if (strlen( name) == 0) { /* "" */
! 392: mbchar = MBCHAR; /* Restore to the default encoding */
! 393: } else if (memcmp( norm, "iso8859", 7) == 0 /* iso8859* */
! 394: || memcmp( norm, "latin", 5) == 0 /* latin* */
! 395: || memcmp( norm, "en", 2) == 0) { /* en* */
! 396: mbchar = 0; /* No multi-byte character */
! 397: } else {
! 398: alias = 2;
! 399: #if COMPILER == MSC
! 400: if (pragma == SETLOCALE) /* #pragma setlocale */
! 401: alias = 0;
! 402: #endif
! 403: loc = search_encoding( norm, alias); /* Search the name */
! 404: }
! 405: if (loc == NULL) {
! 406: if ((env || pragma) && (warn_level & 1)) {
! 407: cwarn( unknown_encoding, name, 0L, NULL);
! 408: } else { /* -m option */
! 409: mcpp_fprintf( ERR, unknown_encoding, name);
! 410: mcpp_fputc( '\n', ERR);
! 411: }
! 412: } else {
! 413: mb_init(); /* Re-initialize */
! 414: }
! 415: return loc;
! 416: }
! 417:
! 418: static const char * search_encoding(
! 419: char * norm, /* The name of encoding specified */
! 420: int alias /* The number of alias to start searching */
! 421: )
! 422: {
! 423: const char * loc;
! 424: int lo, al;
! 425:
! 426: for (lo = 0; lo < NUM_ENCODING; lo++) {
! 427: for (al = alias ; al < NUM_ALIAS; al++) {
! 428: loc = encoding_name[ lo][ al];
! 429: if (str_eq( loc, norm)) {
! 430: switch (lo) {
! 431: case 0 : mbchar = 0; break;
! 432: case 1 : mbchar = EUC_JP; break;
! 433: case 2 : mbchar = GB2312; break;
! 434: case 3 : mbchar = KSC5601; break;
! 435: case 4 : mbchar = SJIS; break;
! 436: case 5 : mbchar = BIGFIVE; break;
! 437: case 6 : mbchar = ISO2022_JP; break;
! 438: case 7 : mbchar = UTF8; break;
! 439: }
! 440: return loc;
! 441: }
! 442: }
! 443: }
! 444: return NULL;
! 445: }
! 446:
static void     strip_bar(
    char *  string
)
/*
 * Strip '_', '-' or '.' in the string, in place.
 * The characters kept are moved down in a single pass, instead of shifting
 * the whole tail of the string for each character removed.
 */
{
    const char *    from = string;      /* Next character to examine    */
    char *          to = string;        /* Where to store the next kept one */

    for ( ; *from != '\0'; from++) {
        if (*from != '_' && *from != '-' && *from != '.')
            *to++ = *from;
    }
    *to = '\0';                         /* Terminate the shortened string   */
}
! 463:
! 464: static void conv_case(
! 465: char * name, /* (diretory) Name */
! 466: char * lim, /* End of (directory) name */
! 467: int upper /* TRUE if to upper */
! 468: )
! 469: /* Convert a string to upper-case letters or lower-case letters in-place */
! 470: {
! 471: int c;
! 472: char * sp;
! 473:
! 474: for (sp = name; sp < lim; sp++) {
! 475: c = *sp & UCHARMAX;
! 476: #if MBCHAR
! 477: if ((char_type[ c] & mbstart)) {
! 478: char tmp[ PATHMAX+1];
! 479: char * tp = tmp;
! 480: *tp++ = *sp++;
! 481: mb_read( c, &sp, &tp);
! 482: } else
! 483: #endif
! 484: {
! 485: if (upper)
! 486: *sp = toupper( c);
! 487: else
! 488: *sp = tolower( c);
! 489: }
! 490: }
! 491: }
! 492:
! 493: void mb_init( void)
! 494: /*
! 495: * Initialize multi-byte character settings.
! 496: * First called prior to setting the 'mcpp_mode'.
! 497: * Will be called again each time the multibyte character encoding is changed.
! 498: */
! 499: {
! 500: /*
! 501: * Select the character classification table, select the multi-byte
! 502: * character reading routine and decide whether multi-byte character
! 503: * may contain the byte of value 0x5c.
! 504: */
! 505: switch (mbchar) {
! 506: case 0 :
! 507: case EUC_JP :
! 508: case GB2312 :
! 509: case KSC5601 :
! 510: char_type = type_euc;
! 511: bsl_in_mbchar = FALSE;
! 512: mb_read = mb_read_2byte;
! 513: break;
! 514: case SJIS :
! 515: case BIGFIVE :
! 516: char_type = type_bsl;
! 517: bsl_in_mbchar = TRUE;
! 518: mb_read = mb_read_2byte;
! 519: break;
! 520: case ISO2022_JP :
! 521: char_type = type_iso2022_jp;
! 522: bsl_in_mbchar = TRUE;
! 523: mb_read = mb_read_iso2022_jp;
! 524: break;
! 525: case UTF8 :
! 526: char_type = type_utf8;
! 527: bsl_in_mbchar = FALSE;
! 528: mb_read = mb_read_utf8;
! 529: break;
! 530: }
! 531:
! 532: /* Set the bit patterns for character classification. */
! 533: switch (mbchar) {
! 534: case 0 :
! 535: mbstart = 0;
! 536: break;
! 537: case EUC_JP :
! 538: mbstart = EJ1;
! 539: mb2 = EJ2;
! 540: break;
! 541: case GB2312 :
! 542: mbstart = GB1;
! 543: mb2 = GB2;
! 544: break;
! 545: case KSC5601:
! 546: mbstart = KS1;
! 547: mb2 = KS2;
! 548: break;
! 549: case SJIS :
! 550: mbstart = SJ1;
! 551: mb2 = SJ2;
! 552: break;
! 553: case BIGFIVE:
! 554: mbstart = BF1;
! 555: mb2 = BF2;
! 556: break;
! 557: case ISO2022_JP :
! 558: mbstart = IS1;
! 559: break;
! 560: case UTF8 :
! 561: mbstart = (U2_1 | U3_1 | U4_1);
! 562: break;
! 563: }
! 564: switch (mbchar) {
! 565: case 0 :
! 566: mbchk = 0;
! 567: break;
! 568: case EUC_JP :
! 569: case GB2312 :
! 570: case KSC5601:
! 571: case SJIS :
! 572: case BIGFIVE:
! 573: case UTF8 :
! 574: mbchk = NA;
! 575: break;
! 576: case ISO2022_JP :
! 577: mbchk = (IS1 | NA);
! 578: break;
! 579: }
! 580:
! 581: /*
! 582: * Set special handling for some encodings to supplement some compiler's
! 583: * deficiency.
! 584: */
! 585: switch (mbchar) {
! 586: case SJIS :
! 587: #if ! SJIS_IS_ESCAPE_FREE
! 588: bsl_need_escape = TRUE;
! 589: #endif
! 590: break;
! 591: case BIGFIVE:
! 592: #if ! BIGFIVE_IS_ESCAPE_FREE
! 593: bsl_need_escape = TRUE;
! 594: #endif
! 595: break;
! 596: case ISO2022_JP :
! 597: #if ! ISO2022_JP_IS_ESCAPE_FREE
! 598: bsl_need_escape = TRUE;
! 599: #endif
! 600: break;
! 601: default :
! 602: bsl_need_escape = FALSE;
! 603: break;
! 604: }
! 605:
! 606: /*
! 607: * Modify magic characters in character type table.
! 608: * char_type[] table should be rewritten in accordance with the 'mcpp_mode'
! 609: * whenever the encoding is changed.
! 610: */
! 611: if (mcpp_mode) { /* If mcpp_mode is already set */
! 612: char_type[ DEF_MAGIC] = standard ? LET : 0;
! 613: char_type[ IN_SRC] = (mcpp_mode == STD) ? LET : 0;
! 614: char_type[ TOK_SEP] = (mcpp_mode == STD || mcpp_mode == OLD_PREP)
! 615: ? HSPA: 0; /* TOK_SEP equals to COM_SEP */
! 616: }
! 617: }
! 618:
! 619: static size_t mb_read_2byte(
! 620: int c1, /* The 1st byte of mbchar sequence (already read) */
! 621: char ** in_pp, /* Pointer to input */
! 622: char ** out_pp /* Pointer to output */
! 623: )
! 624: /*
! 625: * Multi-byte character reading routine for 2-byte encodings.
! 626: */
! 627: {
! 628: int error = FALSE;
! 629: size_t len = 0; /* Number of multi-byte characters read. */
! 630: char * in_p = *in_pp;
! 631: char * out_p = *out_pp;
! 632:
! 633: if (! (char_type[ c1 & UCHARMAX] & mbstart))
! 634: return MB_ERROR; /* Not a multi-byte character */
! 635:
! 636: do {
! 637: if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mb2)) {
! 638: error = TRUE;
! 639: break;
! 640: }
! 641: len++;
! 642: } while (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mbstart);
! 643: *in_pp = --in_p;
! 644: *(--out_p) = EOS;
! 645: *out_pp = out_p;
! 646: return error ? (len | MB_ERROR) : len;
! 647: }
! 648:
! 649: static size_t mb_read_iso2022_jp(
! 650: int c1, /* The 1st byte of the sequence already read (always 0x1b). */
! 651: char ** in_pp,
! 652: char ** out_pp
! 653: )
! 654: /*
! 655: * Multi-byte character reading routine for ISO2022_JP.
! 656: */
! 657: {
! 658: int error = FALSE;
! 659: size_t len = 0;
! 660: char * in_p = *in_pp;
! 661: char * out_p = *out_pp;
! 662: int c2, c3, c4;
! 663:
! 664: if (! (char_type[ c1 & UCHARMAX] & mbstart))
! 665: return MB_ERROR;
! 666:
! 667: do {
! 668:
! 669: *out_p++ = c2 = *in_p++;
! 670: if (! (char_type[ c2 & UCHARMAX] & IS2)) {
! 671: error = TRUE;
! 672: break;
! 673: }
! 674: *out_p++ = c3 = *in_p++;
! 675: if (! (char_type[ c3 & UCHARMAX] & IS3)) {
! 676: error = TRUE;
! 677: break;
! 678: }
! 679:
! 680: switch (c2) {
! 681: case 0x24 :
! 682: switch (c3) {
! 683: case 0x42 : /* 0x1b 0x24 0x42: JIS X 0208-1983 */
! 684: break;
! 685: case 0x28 :
! 686: *out_p++ = c4 = *in_p++;
! 687: if (! (char_type[ c4 & UCHARMAX] & IS4))
! 688: error = TRUE;
! 689: /* else: 0x1b 0x24 0x28 0x44: JIS X 0212 */
! 690: break;
! 691: default :
! 692: error = TRUE;
! 693: }
! 694: break;
! 695: case 0x28 :
! 696: switch (c3) {
! 697: case 0x42 : /* 0x1b 0x28 0x42: ASCII */
! 698: c1 = *out_p++ = *in_p++ & UCHARMAX;
! 699: continue;
! 700: default :
! 701: error = TRUE;
! 702: }
! 703: break;
! 704: }
! 705: if (error)
! 706: break;
! 707:
! 708: while (char_type[ c1 = *out_p++ = (*in_p++ & UCHARMAX)] & IJP) {
! 709: if (! (char_type[ *out_p++ = (*in_p++ & UCHARMAX)] & IJP)) {
! 710: error = TRUE;
! 711: break;
! 712: }
! 713: len++; /* String of multi-byte characters */
! 714: }
! 715: if (error)
! 716: break;
! 717:
! 718: } while (char_type[ c1] & IS1); /* 0x1b: start of shift-sequence */
! 719:
! 720: *in_pp = --in_p;
! 721: *(--out_p) = EOS;
! 722: *out_pp = out_p;
! 723: return error ? (len | MB_ERROR) : len;
! 724: }
! 725:
! 726: static size_t mb_read_utf8(
! 727: int c1,
! 728: char ** in_pp,
! 729: char ** out_pp
! 730: )
! 731: /*
! 732: * Multi-byte character reading routine for UTF8.
! 733: */
! 734: {
! 735: int error = FALSE;
! 736: size_t len = 0;
! 737: char * in_p = *in_pp;
! 738: char * out_p = *out_pp;
! 739:
! 740: if (! (char_type[ c1 & UCHARMAX] & mbstart))
! 741: return MB_ERROR;
! 742:
! 743: do {
! 744: unsigned int codepoint;
! 745: int i, bytes;
! 746:
! 747: if ((char_type[ c1 & UCHARMAX] & U4_1) == U4_1)
! 748: bytes = 4; /* 4-byte character */
! 749: else if ((char_type[ c1 & UCHARMAX] & U3_1) == U3_1)
! 750: bytes = 3; /* 3-byte character */
! 751: else if ((char_type[ c1 & UCHARMAX] & U2_1) == U2_1)
! 752: bytes = 2; /* 2-byte character */
! 753:
! 754: /* Must ensure that the sequence is not reserved as a surrogate */
! 755: codepoint = ((2 << (6-bytes)) - 1) & c1; /* mask off top bits */
! 756:
! 757: /* All bytes left in the sequence must be in 0x80 - 0xBF */
! 758: for (i = bytes - 1; i && !error; i--) {
! 759: codepoint = (codepoint << 6) + ((*in_p) & 0x3fU);
! 760: if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & UCONT))
! 761: error = TRUE;
! 762: }
! 763:
! 764: /* Check for overlong/underlong sequences */
! 765: if ((bytes == 2 && (codepoint < 0x80 || codepoint > 0x7FF))
! 766: || (bytes == 3 && (codepoint < 0x800 || codepoint > 0xFFFF))
! 767: || (bytes == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)))
! 768: error = TRUE;
! 769: if ((codepoint >= 0xD800 && codepoint <= 0xDFFF)
! 770: /* Check for reserved surrogate codepoints */
! 771: || (codepoint >= 0xFFFE && codepoint <= 0xFFFF))
! 772: /* Illegal */
! 773: error = TRUE;
! 774: #if 0
! 775: printf( "codepoint:0x%x\n", codepoint);
! 776: #endif
! 777: if (error)
! 778: break;
! 779: len++;
! 780: } while (char_type[ (*out_p++ = c1 = *in_p++) & UCHARMAX] & mbstart);
! 781: /* Start of the next multi-byte character */
! 782: *in_pp = --in_p;
! 783: *(--out_p) = EOS;
! 784: *out_pp = out_p;
! 785: return error ? (len | MB_ERROR) : len;
! 786: }
! 787:
! 788: uexpr_t mb_eval(
! 789: char ** seq_pp
! 790: )
! 791: /*
! 792: * Evaluate the value of a multi-byte character.
! 793: * This routine does not check the legality of the sequence.
! 794: * This routine is called from eval_char().
! 795: * This routine is never called in POST_STD mode.
! 796: */
! 797: {
! 798: char * seq = *seq_pp;
! 799: uexpr_t val = 0;
! 800: int c, c1;
! 801:
! 802: if (! (char_type[ c = *seq++ & UCHARMAX] & mbstart)) {
! 803: *seq_pp = seq;
! 804: return c; /* Not a multi-byte character */
! 805: }
! 806:
! 807: switch (mbchar) {
! 808: case EUC_JP :
! 809: case GB2312 :
! 810: case KSC5601:
! 811: case SJIS :
! 812: case BIGFIVE:
! 813: val = (c << 8) + (*seq++ & UCHARMAX);
! 814: /* Evaluate the 2-byte sequence */
! 815: break;
! 816: case ISO2022_JP :
! 817: if (char_type[ c & UCHARMAX] & IS1) { /* Skip shift-sequence */
! 818: if (char_type[ c = *seq++ & UCHARMAX] & IS2) {
! 819: if (char_type[ c1 = *seq++ & UCHARMAX] & IS3) {
! 820: if (c1 == 0x28)
! 821: seq++;
! 822: if (c == 0x28 && c1 == 0x42) { /* Shift-out sequence */
! 823: val = 0;
! 824: break;
! 825: }
! 826: c = *seq++ & UCHARMAX;
! 827: }
! 828: }
! 829: }
! 830: val = (c << 8) + (*seq++ & UCHARMAX); /* Evaluate the 2-bytes */
! 831: break;
! 832: case UTF8 : /* Evaluate the sequence of 2, 3 or 4 bytes as it is */
! 833: val = (c << 8) + (*seq++ & UCHARMAX);
! 834: if (char_type[ c & UCHARMAX] & U3_1) {
! 835: val = (val << 8) + (*seq++ & UCHARMAX);
! 836: } else if (char_type[ c & UCHARMAX] & U4_1) {
! 837: val = (val << 8) + (*seq++ & UCHARMAX);
! 838: val = (val << 8) + (*seq++ & UCHARMAX);
! 839: }
! 840: break;
! 841: }
! 842:
! 843: *seq_pp = seq;
! 844: return val;
! 845: }
! 846:
! 847: int last_is_mbchar(
! 848: const char * in, /* Input physical line */
! 849: int len /* Length of the line minus 2 */
! 850: )
! 851: /*
! 852: * Return 2, if the last char of the line is second byte of SJIS or BIGFIVE,
! 853: * else return 0.
! 854: */
! 855: {
! 856: const char * cp = in + len;
! 857: const char * const endp = in + len; /* -> the char befor '\n' */
! 858:
! 859: if ((mbchar & (SJIS | BIGFIVE)) == 0)
! 860: return 0;
! 861: while (in <= --cp) { /* Search backwardly */
! 862: if ((char_type[ *cp & UCHARMAX] & mbstart) == 0)
! 863: break; /* Not the first byte of MBCHAR */
! 864: }
! 865: if ((endp - cp) & 1)
! 866: return 0;
! 867: else
! 868: return 2;
! 869: }
! 870:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>