/* $OpenXM: OpenXM_contrib2/windows/mcpp/mbchar.c,v 1.1 2010/02/26 17:42:36 ohara Exp $ */
/*-
 * Copyright (c) 1998, 2002-2008 Kiyoshi Matsui
 * All rights reserved.
 *
 * Some parts of this code are derived from the public domain software
 * DECUS cpp (1984,1985) written by Martin Minow.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 *                          M B C H A R . C
 *     C h a r a c t e r    h a n d l i n g    R o u t i n e s
 *
 * Character handling and multi-byte character handling routines are
 * placed here.
 */

#if PREPROCESSED
#include    "mcpp.H"
#else
#include    "system.H"
#include    "internal.H"
#endif

/*
 * Tables of character types and multi-byte character types.
 *
 * Each table has one entry per byte value (UCHARMAX + 1 entries).  An entry
 * is a bit set: the basic class bits (LET, DIG, DOT, PUNC, QUO, SPA, HSP) are
 * not defined in this file (presumably in one of the headers above), while
 * the multi-byte bits are the macros defined next to each table below.
 * The multi-byte bit values 0x100 .. 0x2000 are reused with a different
 * meaning by each table, so only the macros belonging to the table that
 * 'char_type' currently points to are meaningful.
 *
 * Some of these character attributes will be overwritten by
 * execution time option '-@post' or '-@old'.
 * Warning on erroneous sequence will be issued from the caller routines:
 * scan_quote(), scan_id() or scan_number().
 */

/* Non-ASCII characters are always checked by mb_read().    */
#define NA      0x4000  /* Non-ASCII characters */

/* Horizontal spaces (' ', '\t' and TOK_SEP)    */
#define HSPA    (SPA | HSP)

/* mb_init() points this at the table matching the current 'mbchar'.    */
short *     char_type;  /* Pointer to one of the following type_*[].    */

#define EJ1     0x100   /* 1st byte of EUC_JP   */
#define EJ2     0x200   /* 2nd byte of EUC_JP   */
#define GB1     0x400   /* 1st byte of GB2312   */
#define GB2     0x800   /* 2nd byte of GB2312   */
#define KS1     0x1000  /* 1st byte of KSC5601  */
#define KS2     0x2000  /* 2nd byte of KSC5601  */

#define EJ12    (EJ1 | EJ2)     /* 1st byte or 2nd byte of EUC_JP   */
#define GB12    (GB1 | GB2)     /* 1st byte or 2nd byte of GB2312   */
#define KS12    (KS1 | KS2)     /* 1st byte or 2nd byte of KSC5601  */
#define EJ1N    (NA | EJ1)      /* Non-ASCII, 1st byte of EUC_JP only   */
#define EU12N   (NA | EJ12 | GB12 | KS12)
    /* 1st or 2nd byte of EUC_JP, GB2312 or KSC5601, or any other non-ASCII */

/* Selected by mb_init() when 'mbchar' is 0, EUC_JP, GB2312 or KSC5601.     */
static short    type_euc[ UCHARMAX + 1] = {
/*
 * For EUC_JP, GB2312, KSC5601 or other similar multi-byte char encodings.
 */

/* Character type codes */
/*  0,      1,      2,      3,      4,      5,      6,      7,          */
/*  8,      9,      A,      B,      C,      D,      E,      F,     Hex  */

    000,    000,    000,    000,    000,    000,    000,    000,    /* 00   */
    000,    HSPA,   SPA,    SPA,    SPA,    SPA,    000,    000,    /* 08   */
    000,    000,    000,    000,    000,    000,    000,    000,    /* 10   */
    /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()   */
    000,    LET,    LET,    000,    000,    000,    000,    HSPA,   /* 18   */
    HSPA,   PUNC,   QUO,    PUNC,   000,    PUNC,   PUNC,   QUO,    /* 20  !"#$%&'  */
    PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   DOT,    PUNC,   /* 28 ()*+,-./  */
    DIG,    DIG,    DIG,    DIG,    DIG,    DIG,    DIG,    DIG,    /* 30 01234567  */
    DIG,    DIG,    PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   /* 38 89:;<=>?  */

    000,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 40 @ABCDEFG  */
    LET,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 48 HIJKLMNO  */
    LET,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 50 PQRSTUVW  */
    LET,    LET,    LET,    PUNC,   000,    PUNC,   PUNC,   LET,    /* 58 XYZ[\]^_  */
    000,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 60 `abcdefg  */
    LET,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 68 hijklmno  */
    LET,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 70 pqrstuvw  */
    LET,    LET,    LET,    PUNC,   PUNC,   PUNC,   PUNC,   000,    /* 78 xyz{|}~   */

    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   80 .. 87   */
    NA,     NA,     NA,     NA,     NA,     NA,     EJ1N,   NA,     /*   88 .. 8F   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   90 .. 97   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   98 .. 9F   */
    NA,     EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   A0 .. A7   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   A8 .. AF   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   B0 .. B7   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   B8 .. BF   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   C0 .. C7   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   C8 .. CF   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   D0 .. D7   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   D8 .. DF   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   E0 .. E7   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   E8 .. EF   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  /*   F0 .. F7   */
    EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  EU12N,  NA,     /*   F8 .. FF   */
};

/* Selected by mb_init() when 'mbchar' is SJIS or BIGFIVE.  */
static short    type_bsl[ UCHARMAX + 1] = {
/*
 * For SJIS, BIGFIVE or other similar encodings which may have '\\' value as
 * the second byte of multi-byte character.
 */

#define SJ1     0x100   /* 1st byte of SJIS     */
#define SJ2     0x200   /* 2nd byte of SJIS     */
#define BF1     0x400   /* 1st byte of BIGFIVE  */
#define BF2     0x800   /* 2nd byte of BIGFIVE  */

/* Combinations of the above; a trailing 'N' means the NA bit is included.  */
#define SB2     (SJ2 | BF2)
#define SJ2N    (NA | SJ2)
#define SB2N    (NA | SJ2 | BF2)
#define SJ12N   (NA | SJ1 | SJ2)
#define BF12N   (NA | BF1 | BF2)
#define SB12N   (NA | SJ1 | SJ2 | BF1 | BF2)
#define S2B12N  (NA | SJ2 | BF1 | BF2)

/* ASCII letter or punctuator which may also be a 2nd byte of SJIS/BIGFIVE  */
#define LSB2    (LET | SB2)
#define PSB2    (PUNC| SB2)

/* Character type codes */
/*  0,      1,      2,      3,      4,      5,      6,      7,          */
/*  8,      9,      A,      B,      C,      D,      E,      F,     Hex  */

    000,    000,    000,    000,    000,    000,    000,    000,    /* 00   */
    000,    HSPA,   SPA,    SPA,    SPA,    SPA,    000,    000,    /* 08   */
    000,    000,    000,    000,    000,    000,    000,    000,    /* 10   */
    /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()   */
    000,    LET,    LET,    000,    000,    000,    000,    HSPA,   /* 18   */
    HSPA,   PUNC,   QUO,    PUNC,   000,    PUNC,   PUNC,   QUO,    /* 20  !"#$%&'  */
    PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   DOT,    PUNC,   /* 28 ()*+,-./  */
    DIG,    DIG,    DIG,    DIG,    DIG,    DIG,    DIG,    DIG,    /* 30 01234567  */
    DIG,    DIG,    PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   /* 38 89:;<=>?  */

    SB2,    LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   /* 40 @ABCDEFG  */
    LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   /* 48 HIJKLMNO  */
    LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   /* 50 PQRSTUVW  */
    LSB2,   LSB2,   LSB2,   PSB2,   SB2,    PSB2,   PSB2,   LSB2,   /* 58 XYZ[\]^_  */
    SB2,    LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   /* 60 `abcdefg  */
    LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   /* 68 hijklmno  */
    LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   LSB2,   /* 70 pqrstuvw  */
    LSB2,   LSB2,   LSB2,   PSB2,   PSB2,   PSB2,   PSB2,   000,    /* 78 xyz{|}~   */

    SB2N,   SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  /*   80 .. 87   */
    SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  /*   88 .. 8F   */
    SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  /*   90 .. 97   */
    SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  SJ12N,  /*   98 .. 9F   */
    SJ2N,   S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,       /*   A0 .. A7   */
    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,        /*   A8 .. AF   */
    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,        /*   B0 .. B7   */
    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,        /*   B8 .. BF   */
    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,        /*   C0 .. C7   */
    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,        /*   C8 .. CF   */
    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,        /*   D0 .. D7   */
    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,        /*   D8 .. DF   */
    SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  /*   E0 .. E7   */
    SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  /*   E8 .. EF   */
    SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  /*   F0 .. F7   */
    SB12N,  SB12N,  SB12N,  SB12N,  SB12N,  BF12N,  BF12N,  NA,     /*   F8 .. FF   */
};

/*
 * For ISO2022_JP multi-byte character encoding.
 * In type_iso2022_jp[] below, IS1 is set only on 0x1b, IS2 on '$' and '(',
 * IS3 on '(' and 'B', and IS4 on 'D'.  IJP is set on the bytes 0x21 .. 0x7e.
 */

#define IS1     0x100   /* 1st byte of shift-sequence   */
#define IS2     0x200   /* 2nd byte of shift-sequence   */
#define IS3     0x400   /* 3rd byte of shift-sequence   */
#define IS4     0x800   /* 4th byte of shift-sequence   */
#define IJP     0x1000
                /* 1st or 2nd byte of ISO-2022-JP (ISO-2022-JP1)    */

/* Basic class combined with IJP, plus the shift-sequence positions     */
#define PIJP    (PUNC | IJP)
#define QIJP    (QUO | IJP)
#define DTJP    (DOT | IJP)
#define DGJP    (DIG | IJP)
#define LIJP    (LET | IJP)

#define JPS2    (IJP | IS2)
#define PJPS23  (PIJP | IS2 | IS3)
#define LJPS3   (LIJP | IS3)
#define LJPS4   (LIJP | IS4)

/* Selected by mb_init() when 'mbchar' is ISO2022_JP.   */
static short    type_iso2022_jp[ UCHARMAX + 1] = {

/* Character type codes */
/*  0,      1,      2,      3,      4,      5,      6,      7,          */
/*  8,      9,      A,      B,      C,      D,      E,      F,     Hex  */

    000,    000,    000,    000,    000,    000,    000,    000,    /* 00   */
    000,    HSPA,   SPA,    SPA,    SPA,    SPA,    000,    000,    /* 08   */
    000,    000,    000,    000,    000,    000,    000,    000,    /* 10   */
    /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()   */
    000,    LET,    LET,    IS1,    000,    000,    000,    HSPA,   /* 18   */
    HSPA,   PIJP,   QIJP,   PIJP,   JPS2,   PIJP,   PIJP,   QIJP,   /* 20  !"#$%&'  */
    PJPS23,PIJP,    PIJP,   PIJP,   PIJP,   PIJP,   DTJP,   PIJP,   /* 28 ()*+,-./  */
    DGJP,   DGJP,   DGJP,   DGJP,   DGJP,   DGJP,   DGJP,   DGJP,   /* 30 01234567  */
    DGJP,   DGJP,   PIJP,   PIJP,   PIJP,   PIJP,   PIJP,   PIJP,   /* 38 89:;<=>?  */

    IJP,    LIJP,   LJPS3,  LIJP,   LJPS4,  LIJP,   LIJP,   LIJP,   /* 40 @ABCDEFG  */
    LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   /* 48 HIJKLMNO  */
    LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   /* 50 PQRSTUVW  */
    LIJP,   LIJP,   LIJP,   PIJP,   IJP,    PIJP,   PIJP,   LIJP,   /* 58 XYZ[\]^_  */
    IJP,    LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   /* 60 `abcdefg  */
    LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   /* 68 hijklmno  */
    LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   LIJP,   /* 70 pqrstuvw  */
    LIJP,   LIJP,   LIJP,   PIJP,   PIJP,   PIJP,   PIJP,   000,    /* 78 xyz{|}~   */
    /* the codes below are all non-ASCII characters in this encoding    */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   80 .. 87   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   88 .. 8F   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   90 .. 97   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   98 .. 9F   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   A0 .. A7   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   A8 .. AF   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   B0 .. B7   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   B8 .. BF   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   C0 .. C7   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   C8 .. CF   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   D0 .. D7   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   D8 .. DF   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   E0 .. E7   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   E8 .. EF   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   F0 .. F7   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   F8 .. FF   */
};

/*
 * For UTF8 multi-byte character encoding.
*/

#define U2_1    0x100   /* 1st byte of 2-byte encoding of UTF8  */
#define U3_1    0x200   /* 1st byte of 3-byte encoding of UTF8  */
#define U4_1    0x400   /* 1st byte of 4-byte encoding of UTF8  */
#define UCONT   0x800
                /* Continuation of a 2, 3, or 4 byte UTF8 sequence  */

/* The same flags combined with the NA (non-ASCII) bit  */
#define U2_1N   (NA | U2_1)
#define U3_1N   (NA | U3_1)
#define U4_1N   (NA | U4_1)
#define UCONTN  (NA | UCONT)

/*
 * Selected by mb_init() when 'mbchar' is UTF8.
 * 0xC0, 0xC1 and 0xF5 .. 0xFF carry only NA, i.e. they are neither a lead
 * byte nor a continuation byte in this table.
 */
static short    type_utf8[ UCHARMAX + 1] = {

/* Character type codes */
/*  0,      1,      2,      3,      4,      5,      6,      7,          */
/*  8,      9,      A,      B,      C,      D,      E,      F,     Hex  */

    000,    000,    000,    000,    000,    000,    000,    000,    /* 00   */
    000,    HSPA,   SPA,    SPA,    SPA,    SPA,    000,    000,    /* 08   */
    000,    000,    000,    000,    000,    000,    000,    000,    /* 10   */
    /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()   */
    000,    LET,    LET,    000,    000,    000,    000,    HSPA,   /* 18   */
    HSPA,   PUNC,   QUO,    PUNC,   000,    PUNC,   PUNC,   QUO,    /* 20  !"#$%&'  */
    PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   DOT,    PUNC,   /* 28 ()*+,-./  */
    DIG,    DIG,    DIG,    DIG,    DIG,    DIG,    DIG,    DIG,    /* 30 01234567  */
    DIG,    DIG,    PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   PUNC,   /* 38 89:;<=>?  */

    000,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 40 @ABCDEFG  */
    LET,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 48 HIJKLMNO  */
    LET,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 50 PQRSTUVW  */
    LET,    LET,    LET,    PUNC,   000,    PUNC,   PUNC,   LET,    /* 58 XYZ[\]^_  */
    000,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 60 `abcdefg  */
    LET,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 68 hijklmno  */
    LET,    LET,    LET,    LET,    LET,    LET,    LET,    LET,    /* 70 pqrstuvw  */
    LET,    LET,    LET,    PUNC,   PUNC,   PUNC,   PUNC,   000,    /* 78 xyz{|}~   */

    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,        /*   80 .. 87   */
    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,        /*   88 .. 8F   */
    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,        /*   90 .. 97   */
    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,        /*   98 .. 9F   */
    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,        /*   A0 .. A7   */
    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,        /*   A8 .. AF   */
    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,        /*   B0 .. B7   */
    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,        /*   B8 .. BF   */
    NA,     NA,     U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  /*   C0 .. C7   */
    U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  /*   C8 .. CF   */
    U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  /*   D0 .. D7   */
    U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  U2_1N,  /*   D8 .. DF   */
    U3_1N,  U3_1N,  U3_1N,  U3_1N,  U3_1N,  U3_1N,  U3_1N,  U3_1N,  /*   E0 .. E7   */
    U3_1N,  U3_1N,  U3_1N,  U3_1N,  U3_1N,  U3_1N,  U3_1N,  U3_1N,  /*   E8 .. EF   */
    U4_1N,  U4_1N,  U4_1N,  U4_1N,  U4_1N,  NA,     NA,     NA,     /*   F0 .. F7   */
    NA,     NA,     NA,     NA,     NA,     NA,     NA,     NA,     /*   F8 .. FF   */
};

#define SETLOCALE       2       /* #pragma setlocale (not __setlocale)  */
#define NUM_ENCODING    8       /* Rows of encoding_name[][]            */
#define NUM_ALIAS       6       /* Columns (aliases) per encoding       */

/*
 * Names of encoding recognized.  Table for search_encoding().
 * The row index selects the encoding assigned to 'mbchar' by
 * search_encoding(): 0 = none, 1 = EUC_JP, 2 = GB2312, 3 = KSC5601,
 * 4 = SJIS, 5 = BIGFIVE, 6 = ISO2022_JP, 7 = UTF8.
 * set_encoding() starts the search at column 2, or at column 0 for
 * '#pragma setlocale' when COMPILER == MSC.
 * "" fills the slots for which there is no name.
 */
static const char * const   encoding_name[ NUM_ENCODING][ NUM_ALIAS] = {
    /* Visual C full, Visual C short
        , 4 miscellaneous  */
    { "english",    "c"
        , "c",      "en",   "latin",    "iso8859"},
    { "",           ""
        , "eucjp",  "euc",  "ujis",     ""},
    { "chinesesimplified",  "chs"
        , "gb2312", "cngb",     "euccn",    ""},
    { "korean",   "kor"
        , "ksc5601",    "ksx1001",  "wansung",  "euckr"},
    { "japanese", "jpn"
        , "sjis",   "shiftjis", "mskanji",  ""},
    { "chinesetraditional", "cht"
        , "bigfive",    "big5", "cnbig5",   "euctw"},
    { "",           ""
        , "iso2022jp",  "iso2022jp1",   "jis",  ""},
    { "",           ""
        , "utf8",   "utf",      "",     ""},
};

/* Both are set by mb_init() from the flag macros of the selected table.    */
static int      mbstart;    /* Flag bit(s) marking the 1st byte of an mbchar    */
static int      mb2;        /* Flag bit marking the 2nd byte (2-byte encodings) */

static size_t   mb_read_2byte( int c1, char ** in_pp, char ** out_pp);
                /* For 2-byte encodings of mbchar   */
static const char *     search_encoding( char * norm, int alias);
                /* Search encoding_name[][] table   */
static void     strip_bar( char * string);
                /* Remove '_', '-' or '.'
in the string */ static void conv_case( char * name, char * lim, int upper); /* Convert to upper/lower case */ static size_t mb_read_iso2022_jp( int c1, char ** in_pp, char ** out_pp); /* For ISO2022_JP encoding */ static size_t mb_read_utf8( int c1, char ** in_pp, char ** out_pp); /* For UTF8 mbchar encoding */ #define NAMLEN 20 #define UPPER 1 /* To upper */ #define LOWER 0 /* To lower */ const char * set_encoding( char * name, /* Name of encoding specified */ char * env, /* Name of environment variable */ int pragma /* 2: #pragma setlocale, 1: #pragma __setlocale, 0: not #pragma */ ) /* * Search the encoding specified and re-initialize mbchar settings. */ { const char * unknown_encoding = "Unknown encoding: %s%.0ld%.0s"; /* _W1_ */ const char * too_long = "Too long encoding name: %s%.0ld%.0s"; /* _E_ */ const char * loc = ""; int alias; char norm[ NAMLEN]; /* * Normalized name (removed 'xxxxx.', stripped '_', '-', '.' * and lowered. */ if (strlen( name) >= NAMLEN) { if ((env || pragma) && (warn_level & 1)) { cwarn( too_long, name, 0L, NULL); } else { mcpp_fprintf( ERR, too_long, name); mcpp_fputc( '\n', ERR); } } strcpy( norm, name); if (norm[ 5] == '.') memmove( norm, norm + 5, strlen( norm + 5) + 1); /* Remove initial 'xxxxx.' as 'ja_JP.', 'en_US.' 
or any other */ conv_case( norm, norm + strlen( norm), LOWER); strip_bar( norm); if (strlen( name) == 0) { /* "" */ mbchar = MBCHAR; /* Restore to the default encoding */ } else if (memcmp( norm, "iso8859", 7) == 0 /* iso8859* */ || memcmp( norm, "latin", 5) == 0 /* latin* */ || memcmp( norm, "en", 2) == 0) { /* en* */ mbchar = 0; /* No multi-byte character */ } else { alias = 2; #if COMPILER == MSC if (pragma == SETLOCALE) /* #pragma setlocale */ alias = 0; #endif loc = search_encoding( norm, alias); /* Search the name */ } if (loc == NULL) { if ((env || pragma) && (warn_level & 1)) { cwarn( unknown_encoding, name, 0L, NULL); } else { /* -m option */ mcpp_fprintf( ERR, unknown_encoding, name); mcpp_fputc( '\n', ERR); } } else { mb_init(); /* Re-initialize */ } return loc; } static const char * search_encoding( char * norm, /* The name of encoding specified */ int alias /* The number of alias to start searching */ ) { const char * loc; int lo, al; for (lo = 0; lo < NUM_ENCODING; lo++) { for (al = alias ; al < NUM_ALIAS; al++) { loc = encoding_name[ lo][ al]; if (str_eq( loc, norm)) { switch (lo) { case 0 : mbchar = 0; break; case 1 : mbchar = EUC_JP; break; case 2 : mbchar = GB2312; break; case 3 : mbchar = KSC5601; break; case 4 : mbchar = SJIS; break; case 5 : mbchar = BIGFIVE; break; case 6 : mbchar = ISO2022_JP; break; case 7 : mbchar = UTF8; break; } return loc; } } } return NULL; } static void strip_bar( char * string ) /* * Strip '_', '-' or '.' in the string. 
*/ { char * cp = string; while (*cp != EOS) { if (*cp == '_' || *cp == '-' || *cp == '.') memmove( cp, cp + 1, strlen( cp)); else cp++; } } static void conv_case( char * name, /* (diretory) Name */ char * lim, /* End of (directory) name */ int upper /* TRUE if to upper */ ) /* Convert a string to upper-case letters or lower-case letters in-place */ { int c; char * sp; for (sp = name; sp < lim; sp++) { c = *sp & UCHARMAX; #if MBCHAR if ((char_type[ c] & mbstart)) { char tmp[ PATHMAX+1]; char * tp = tmp; *tp++ = *sp++; mb_read( c, &sp, &tp); } else #endif { if (upper) *sp = toupper( c); else *sp = tolower( c); } } } void mb_init( void) /* * Initialize multi-byte character settings. * First called prior to setting the 'mcpp_mode'. * Will be called again each time the multibyte character encoding is changed. */ { /* * Select the character classification table, select the multi-byte * character reading routine and decide whether multi-byte character * may contain the byte of value 0x5c. */ switch (mbchar) { case 0 : case EUC_JP : case GB2312 : case KSC5601 : char_type = type_euc; bsl_in_mbchar = FALSE; mb_read = mb_read_2byte; break; case SJIS : case BIGFIVE : char_type = type_bsl; bsl_in_mbchar = TRUE; mb_read = mb_read_2byte; break; case ISO2022_JP : char_type = type_iso2022_jp; bsl_in_mbchar = TRUE; mb_read = mb_read_iso2022_jp; break; case UTF8 : char_type = type_utf8; bsl_in_mbchar = FALSE; mb_read = mb_read_utf8; break; } /* Set the bit patterns for character classification. 
*/ switch (mbchar) { case 0 : mbstart = 0; break; case EUC_JP : mbstart = EJ1; mb2 = EJ2; break; case GB2312 : mbstart = GB1; mb2 = GB2; break; case KSC5601: mbstart = KS1; mb2 = KS2; break; case SJIS : mbstart = SJ1; mb2 = SJ2; break; case BIGFIVE: mbstart = BF1; mb2 = BF2; break; case ISO2022_JP : mbstart = IS1; break; case UTF8 : mbstart = (U2_1 | U3_1 | U4_1); break; } switch (mbchar) { case 0 : mbchk = 0; break; case EUC_JP : case GB2312 : case KSC5601: case SJIS : case BIGFIVE: case UTF8 : mbchk = NA; break; case ISO2022_JP : mbchk = (IS1 | NA); break; } /* * Set special handling for some encodings to supplement some compiler's * deficiency. */ switch (mbchar) { case SJIS : #if ! SJIS_IS_ESCAPE_FREE bsl_need_escape = TRUE; #endif break; case BIGFIVE: #if ! BIGFIVE_IS_ESCAPE_FREE bsl_need_escape = TRUE; #endif break; case ISO2022_JP : #if ! ISO2022_JP_IS_ESCAPE_FREE bsl_need_escape = TRUE; #endif break; default : bsl_need_escape = FALSE; break; } /* * Modify magic characters in character type table. * char_type[] table should be rewritten in accordance with the 'mcpp_mode' * whenever the encoding is changed. */ if (mcpp_mode) { /* If mcpp_mode is already set */ char_type[ DEF_MAGIC] = standard ? LET : 0; char_type[ IN_SRC] = (mcpp_mode == STD) ? LET : 0; char_type[ TOK_SEP] = (mcpp_mode == STD || mcpp_mode == OLD_PREP) ? HSPA: 0; /* TOK_SEP equals to COM_SEP */ } } static size_t mb_read_2byte( int c1, /* The 1st byte of mbchar sequence (already read) */ char ** in_pp, /* Pointer to input */ char ** out_pp /* Pointer to output */ ) /* * Multi-byte character reading routine for 2-byte encodings. */ { int error = FALSE; size_t len = 0; /* Number of multi-byte characters read. */ char * in_p = *in_pp; char * out_p = *out_pp; if (! (char_type[ c1 & UCHARMAX] & mbstart)) return MB_ERROR; /* Not a multi-byte character */ do { if (! 
(char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mb2)) { error = TRUE; break; } len++; } while (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mbstart); *in_pp = --in_p; *(--out_p) = EOS; *out_pp = out_p; return error ? (len | MB_ERROR) : len; } static size_t mb_read_iso2022_jp( int c1, /* The 1st byte of the sequence already read (always 0x1b). */ char ** in_pp, char ** out_pp ) /* * Multi-byte character reading routine for ISO2022_JP. */ { int error = FALSE; size_t len = 0; char * in_p = *in_pp; char * out_p = *out_pp; int c2, c3, c4; if (! (char_type[ c1 & UCHARMAX] & mbstart)) return MB_ERROR; do { *out_p++ = c2 = *in_p++; if (! (char_type[ c2 & UCHARMAX] & IS2)) { error = TRUE; break; } *out_p++ = c3 = *in_p++; if (! (char_type[ c3 & UCHARMAX] & IS3)) { error = TRUE; break; } switch (c2) { case 0x24 : switch (c3) { case 0x42 : /* 0x1b 0x24 0x42: JIS X 0208-1983 */ break; case 0x28 : *out_p++ = c4 = *in_p++; if (! (char_type[ c4 & UCHARMAX] & IS4)) error = TRUE; /* else: 0x1b 0x24 0x28 0x44: JIS X 0212 */ break; default : error = TRUE; } break; case 0x28 : switch (c3) { case 0x42 : /* 0x1b 0x28 0x42: ASCII */ c1 = *out_p++ = *in_p++ & UCHARMAX; continue; default : error = TRUE; } break; } if (error) break; while (char_type[ c1 = *out_p++ = (*in_p++ & UCHARMAX)] & IJP) { if (! (char_type[ *out_p++ = (*in_p++ & UCHARMAX)] & IJP)) { error = TRUE; break; } len++; /* String of multi-byte characters */ } if (error) break; } while (char_type[ c1] & IS1); /* 0x1b: start of shift-sequence */ *in_pp = --in_p; *(--out_p) = EOS; *out_pp = out_p; return error ? (len | MB_ERROR) : len; } static size_t mb_read_utf8( int c1, char ** in_pp, char ** out_pp ) /* * Multi-byte character reading routine for UTF8. */ { int error = FALSE; size_t len = 0; char * in_p = *in_pp; char * out_p = *out_pp; if (! 
(char_type[ c1 & UCHARMAX] & mbstart)) return MB_ERROR; do { unsigned int codepoint; int i, bytes; if ((char_type[ c1 & UCHARMAX] & U4_1) == U4_1) bytes = 4; /* 4-byte character */ else if ((char_type[ c1 & UCHARMAX] & U3_1) == U3_1) bytes = 3; /* 3-byte character */ else if ((char_type[ c1 & UCHARMAX] & U2_1) == U2_1) bytes = 2; /* 2-byte character */ /* Must ensure that the sequence is not reserved as a surrogate */ codepoint = ((2 << (6-bytes)) - 1) & c1; /* mask off top bits */ /* All bytes left in the sequence must be in 0x80 - 0xBF */ for (i = bytes - 1; i && !error; i--) { codepoint = (codepoint << 6) + ((*in_p) & 0x3fU); if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & UCONT)) error = TRUE; } /* Check for overlong/underlong sequences */ if ((bytes == 2 && (codepoint < 0x80 || codepoint > 0x7FF)) || (bytes == 3 && (codepoint < 0x800 || codepoint > 0xFFFF)) || (bytes == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF))) error = TRUE; if ((codepoint >= 0xD800 && codepoint <= 0xDFFF) /* Check for reserved surrogate codepoints */ || (codepoint >= 0xFFFE && codepoint <= 0xFFFF)) /* Illegal */ error = TRUE; #if 0 printf( "codepoint:0x%x\n", codepoint); #endif if (error) break; len++; } while (char_type[ (*out_p++ = c1 = *in_p++) & UCHARMAX] & mbstart); /* Start of the next multi-byte character */ *in_pp = --in_p; *(--out_p) = EOS; *out_pp = out_p; return error ? (len | MB_ERROR) : len; } uexpr_t mb_eval( char ** seq_pp ) /* * Evaluate the value of a multi-byte character. * This routine does not check the legality of the sequence. * This routine is called from eval_char(). * This routine is never called in POST_STD mode. */ { char * seq = *seq_pp; uexpr_t val = 0; int c, c1; if (! 
(char_type[ c = *seq++ & UCHARMAX] & mbstart)) { *seq_pp = seq; return c; /* Not a multi-byte character */ } switch (mbchar) { case EUC_JP : case GB2312 : case KSC5601: case SJIS : case BIGFIVE: val = (c << 8) + (*seq++ & UCHARMAX); /* Evaluate the 2-byte sequence */ break; case ISO2022_JP : if (char_type[ c & UCHARMAX] & IS1) { /* Skip shift-sequence */ if (char_type[ c = *seq++ & UCHARMAX] & IS2) { if (char_type[ c1 = *seq++ & UCHARMAX] & IS3) { if (c1 == 0x28) seq++; if (c == 0x28 && c1 == 0x42) { /* Shift-out sequence */ val = 0; break; } c = *seq++ & UCHARMAX; } } } val = (c << 8) + (*seq++ & UCHARMAX); /* Evaluate the 2-bytes */ break; case UTF8 : /* Evaluate the sequence of 2, 3 or 4 bytes as it is */ val = (c << 8) + (*seq++ & UCHARMAX); if (char_type[ c & UCHARMAX] & U3_1) { val = (val << 8) + (*seq++ & UCHARMAX); } else if (char_type[ c & UCHARMAX] & U4_1) { val = (val << 8) + (*seq++ & UCHARMAX); val = (val << 8) + (*seq++ & UCHARMAX); } break; } *seq_pp = seq; return val; } int last_is_mbchar( const char * in, /* Input physical line */ int len /* Length of the line minus 2 */ ) /* * Return 2, if the last char of the line is second byte of SJIS or BIGFIVE, * else return 0. */ { const char * cp = in + len; const char * const endp = in + len; /* -> the char befor '\n' */ if ((mbchar & (SJIS | BIGFIVE)) == 0) return 0; while (in <= --cp) { /* Search backwardly */ if ((char_type[ *cp & UCHARMAX] & mbstart) == 0) break; /* Not the first byte of MBCHAR */ } if ((endp - cp) & 1) return 0; else return 2; }