OpenXM_contrib2/windows/mcpp/mbchar.c - annotate

Return to mbchar.c CVS log
Up to [local] / OpenXM_contrib2 / windows / mcpp
Annotation of OpenXM_contrib2/windows/mcpp/mbchar.c, Revision 1.1

1.1     ! ohara       1: /* $OpenXM$ */
        !             2: /*-
        !             3:  * Copyright (c) 1998, 2002-2008 Kiyoshi Matsui <kmatsui@t3.rim.or.jp>
        !             4:  * All rights reserved.
        !             5:  *
        !             6:  * Some parts of this code are derived from the public domain software
        !             7:  * DECUS cpp (1984,1985) written by Martin Minow.
        !             8:  *
        !             9:  * Redistribution and use in source and binary forms, with or without
        !            10:  * modification, are permitted provided that the following conditions
        !            11:  * are met:
        !            12:  * 1. Redistributions of source code must retain the above copyright
        !            13:  *    notice, this list of conditions and the following disclaimer.
        !            14:  * 2. Redistributions in binary form must reproduce the above copyright
        !            15:  *    notice, this list of conditions and the following disclaimer in the
        !            16:  *    documentation and/or other materials provided with the distribution.
        !            17:  *
        !            18:  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
        !            19:  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
        !            20:  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
        !            21:  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR BE LIABLE
        !            22:  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
        !            23:  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
        !            24:  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
        !            25:  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
        !            26:  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
        !            27:  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
        !            28:  * SUCH DAMAGE.
        !            29:  */
        !            30:
        !            31: /*
        !            32:  *                          M B C H A R . C
        !            33:  *      C h a r a c t e r    h a n d l i n g    R o u t i n e s
        !            34:  *
        !            35:  * Character handling and multi-byte character handling routines are
        !            36:  * placed here.
        !            37:  */
        !            38:
        !            39: #if PREPROCESSED
        !            40: #include    "mcpp.H"
        !            41: #else
        !            42: #include    "system.H"
        !            43: #include    "internal.H"
        !            44: #endif
        !            45:
        !            46: /*
        !            47:  * Tables of character types and multi-byte character types.
        !            48:  *
        !            49:  * Some of these character attributes will be overwritten by
        !            50:  *      execution time option '-@post' or '-@old'.
        !            51:  * Warning on erroneous sequence will be issued from the caller routines:
        !            52:  * scan_quote(), scan_id() or scan_number().
        !            53:  */
        !            54:
        !            55: /* Non-ASCII characters are always checked by mb_read().    */
        !            56: #define NA      0x4000  /* Non-ASCII characters */
        !            57:
        !            58: /* Horizontal spaces (' ', '\t' and TOK_SEP)    */
        !            59: #define HSPA    (SPA | HSP)
        !            60:
        !            61: short *     char_type;  /* Pointer to one of the following type_*[].    */
        !            62:
                         /*
                          * Flag bits used by type_euc[].  The bit values 0x100 .. 0x1000 are
                          * defined again under other names (SJ1 .., IS1 .., U2_1 ..) for the
                          * SJIS/BIGFIVE, ISO2022_JP and UTF8 tables further down, so a flag is
                          * only meaningful together with the table it was written for.
                          */
        !            63: #define EJ1     0x100   /* 1st byte of EUC_JP   */
        !            64: #define EJ2     0x200   /* 2nd byte of EUC_JP   */
        !            65: #define GB1     0x400   /* 1st byte of GB2312   */
        !            66: #define GB2     0x800   /* 2nd byte of GB2312   */
        !            67: #define KS1     0x1000  /* 1st byte of KSC5601  */
        !            68: #define KS2     0x2000  /* 2nd byte of KSC5601  */
        !            69:
        !            70: #define EJ12    (EJ1 | EJ2)     /* 1st byte or 2nd byte of EUC_JP   */
        !            71: #define GB12    (GB1 | GB2)     /* 1st byte or 2nd byte of GB2312   */
        !            72: #define KS12    (KS1 | KS2)     /* 1st byte or 2nd byte of KSC5601  */
        !            73: #define EJ1N    (NA | EJ1)      /* Non-ASCII, 1st byte of EUC_JP only   */
        !            74: #define EU12N   (NA | EJ12 | GB12 | KS12)
        !            75:     /* 1st or 2nd byte of EUC_JP, GB2312 or KSC5601, or any other non-ASCII */
        !            76:
        !            77: static short    type_euc[ UCHARMAX + 1] = {
        !            78: /*
        !            79:  * For EUC_JP, GB2312, KSC5601 or other similar multi-byte char encodings.
        !            80:  */
                         /*
                          * Eight entries per row; the trailing comment of each row gives the
                          * hexadecimal value of its first entry.
                          * In the upper half 0x8E carries EJ1N, 0xA1 .. 0xFE carry EU12N, and
                          * every other value from 0x80 up (including 0xA0 and 0xFF) is NA only.
                          */
        !            81:
        !            82: /* Character type codes */
        !            83: /*   0,     1,     2,     3,     4,     5,     6,     7,                    */
        !            84: /*   8,     9,     A,     B,     C,     D,     E,     F,       Hex          */
        !            85:
        !            86:    000,   000,   000,   000,   000,   000,   000,   000,    /* 00           */
        !            87:    000,   HSPA,  SPA,   SPA,   SPA,   SPA,   000,   000,    /* 08           */
        !            88:    000,   000,   000,   000,   000,   000,   000,   000,    /* 10           */
        !            89:     /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()       */
        !            90:    000,   LET,   LET,   000,   000,   000,   000,   HSPA,   /* 18           */
        !            91:    HSPA,  PUNC,  QUO,   PUNC,  000,   PUNC,  PUNC,  QUO,    /* 20  !"#$%&'  */
        !            92:    PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  DOT,   PUNC,   /* 28 ()*+,-./  */
        !            93:    DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,    /* 30 01234567  */
        !            94:    DIG,   DIG,   PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,   /* 38 89:;<=>?  */
        !            95:
        !            96:    000,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 40 @ABCDEFG  */
        !            97:    LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 48 HIJKLMNO  */
        !            98:    LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 50 PQRSTUVW  */
        !            99:    LET,   LET,   LET,   PUNC,  000,   PUNC,  PUNC,  LET,    /* 58 XYZ[\]^_  */
        !           100:    000,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 60 `abcdefg  */
        !           101:    LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 68 hijklmno  */
        !           102:    LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 70 pqrstuvw  */
        !           103:    LET,   LET,   LET,   PUNC,  PUNC,  PUNC,  PUNC,  000,    /* 78 xyz{|}~   */
        !           104:
        !           105:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   80 .. 87   */
        !           106:    NA,    NA,    NA,    NA,    NA,    NA,    EJ1N,  NA,     /*   88 .. 8F   */
        !           107:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   90 .. 97   */
        !           108:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   98 .. 9F   */
        !           109:    NA,    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   A0 .. A7   */
        !           110:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   A8 .. AF   */
        !           111:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   B0 .. B7   */
        !           112:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   B8 .. BF   */
        !           113:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   C0 .. C7   */
        !           114:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   C8 .. CF   */
        !           115:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   D0 .. D7   */
        !           116:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   D8 .. DF   */
        !           117:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   E0 .. E7   */
        !           118:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   E8 .. EF   */
        !           119:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N,  /*   F0 .. F7   */
        !           120:    EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, EU12N, NA,     /*   F8 .. FF   */
        !           121: };
        !           122:
        !           123: static short    type_bsl[ UCHARMAX + 1] = {
        !           124: /*
        !           125:  * For SJIS, BIGFIVE or other similar encodings which may have '\\' value as
        !           126:  * the second byte of multi-byte character.
        !           127:  */
                         /*
                          * Multi-byte flags found in the rows below:
                          *   0x40 .. 0x7E   SB2 (SJ2 | BF2) on top of the ASCII type code
                          *   0x80           SB2N             0x81 .. 0x9F   SJ12N
                          *   0xA0           SJ2N             0xA1 .. 0xDF   S2B12N
                          *   0xE0 .. 0xFC   SB12N            0xFD, 0xFE     BF12N
                          *   0xFF           NA only
                          * 0x7F and everything below 0x40 carry no multi-byte flag.
                          */
        !           128:
        !           129: #define SJ1     0x100   /* 1st byte of SJIS     */
        !           130: #define SJ2     0x200   /* 2nd byte of SJIS     */
        !           131: #define BF1     0x400   /* 1st byte of BIGFIVE  */
        !           132: #define BF2     0x800   /* 2nd byte of BIGFIVE  */
        !           133:
        !           134: #define SB2     (SJ2 | BF2)
        !           135: #define SJ2N    (NA | SJ2)
        !           136: #define SB2N    (NA | SJ2 | BF2)
        !           137: #define SJ12N   (NA | SJ1 | SJ2)
        !           138: #define BF12N   (NA | BF1 | BF2)
        !           139: #define SB12N   (NA | SJ1 | SJ2 | BF1 | BF2)
        !           140: #define S2B12N  (NA | SJ2 | BF1 | BF2)
        !           141:
        !           142: #define LSB2    (LET | SB2)
        !           143: #define PSB2    (PUNC| SB2)
        !           144:
        !           145: /* Character type codes */
        !           146: /*   0,     1,     2,     3,     4,     5,     6,     7,                    */
        !           147: /*   8,     9,     A,     B,     C,     D,     E,     F,       Hex          */
        !           148:
        !           149:    000,   000,   000,   000,   000,   000,   000,   000,    /* 00           */
        !           150:    000,   HSPA,  SPA,   SPA,   SPA,   SPA,   000,   000,    /* 08           */
        !           151:    000,   000,   000,   000,   000,   000,   000,   000,    /* 10           */
        !           152:     /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()       */
        !           153:    000,   LET,   LET,   000,   000,   000,   000,   HSPA,   /* 18           */
        !           154:    HSPA,  PUNC,  QUO,   PUNC,  000,   PUNC,  PUNC,  QUO,    /* 20  !"#$%&'  */
        !           155:    PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  DOT,   PUNC,   /* 28 ()*+,-./  */
        !           156:    DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,    /* 30 01234567  */
        !           157:    DIG,   DIG,   PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,   /* 38 89:;<=>?  */
        !           158:
        !           159:    SB2,   LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 40 @ABCDEFG  */
        !           160:    LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 48 HIJKLMNO  */
        !           161:    LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 50 PQRSTUVW  */
        !           162:    LSB2,  LSB2,  LSB2,  PSB2,  SB2,   PSB2,  PSB2,  LSB2,   /* 58 XYZ[\]^_  */
        !           163:    SB2,   LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 60 `abcdefg  */
        !           164:    LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 68 hijklmno  */
        !           165:    LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,  LSB2,   /* 70 pqrstuvw  */
        !           166:    LSB2,  LSB2,  LSB2,  PSB2,  PSB2,  PSB2,  PSB2,  000,    /* 78 xyz{|}~   */
        !           167:
        !           168:    SB2N,  SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N,  /*   80 .. 87   */
        !           169:    SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N,  /*   88 .. 8F   */
        !           170:    SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N,  /*   90 .. 97   */
        !           171:    SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N, SJ12N,  /*   98 .. 9F   */
        !           172:    SJ2N,  S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   A0 .. A7   */
        !           173:    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   A8 .. AF   */
        !           174:    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   B0 .. B7   */
        !           175:    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   B8 .. BF   */
        !           176:    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   C0 .. C7   */
        !           177:    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   C8 .. CF   */
        !           178:    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   D0 .. D7   */
        !           179:    S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N,S2B12N, /*   D8 .. DF   */
        !           180:    SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N,  /*   E0 .. E7   */
        !           181:    SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N,  /*   E8 .. EF   */
        !           182:    SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N, SB12N,  /*   F0 .. F7   */
        !           183:    SB12N, SB12N, SB12N, SB12N, SB12N, BF12N, BF12N, NA,     /*   F8 .. FF   */
        !           184: };
        !           185:
        !           186: /*
        !           187:  * For ISO2022_JP multi-byte character encoding.
        !           188:  */
        !           189:
        !           190: #define IS1     0x100   /* 1st byte of shift-sequence   */
        !           191: #define IS2     0x200   /* 2nd byte of shift-sequence   */
        !           192: #define IS3     0x400   /* 3rd byte of shift-sequence   */
        !           193: #define IS4     0x800   /* 4th byte of shift-sequence   */
        !           194: #define IJP     0x1000  /* 1st or 2nd byte of ISO-2022-JP (ISO-2022-JP1)    */
        !           195:
        !           196: #define PIJP    (PUNC | IJP)
        !           197: #define QIJP    (QUO | IJP)
        !           198: #define DTJP    (DOT | IJP)
        !           199: #define DGJP    (DIG | IJP)
        !           200: #define LIJP    (LET | IJP)
        !           201:
        !           202: #define JPS2    (IJP | IS2)
        !           203: #define PJPS23  (PIJP | IS2 | IS3)
        !           204: #define LJPS3   (LIJP | IS3)
        !           205: #define LJPS4   (LIJP | IS4)
        !           206:
                         /*
                          * Shift-sequence flags in the table below:
                          *   IS1  only on 0x1B
                          *   IS2  on '$' (0x24) and '(' (0x28)
                          *   IS3  on '(' (0x28) and 'B' (0x42)
                          *   IS4  on 'D' (0x44)
                          * IJP is set on every value 0x21 .. 0x7E next to its ASCII type code;
                          * all values from 0x80 up are NA only.
                          */
        !           207: static short    type_iso2022_jp[ UCHARMAX + 1] = {
        !           208:
        !           209: /* Character type codes */
        !           210: /*   0,     1,     2,     3,     4,     5,     6,     7,                    */
        !           211: /*   8,     9,     A,     B,     C,     D,     E,     F,       Hex          */
        !           212:
        !           213:    000,   000,   000,   000,   000,   000,   000,   000,    /* 00           */
        !           214:    000,   HSPA,  SPA,   SPA,   SPA,   SPA,   000,   000,    /* 08           */
        !           215:    000,   000,   000,   000,   000,   000,   000,   000,    /* 10           */
        !           216:     /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()       */
        !           217:    000,   LET,   LET,   IS1,   000,   000,   000,   HSPA,   /* 18           */
        !           218:    HSPA,  PIJP,  QIJP,  PIJP,  JPS2,  PIJP,  PIJP,  QIJP,   /* 20  !"#$%&'  */
        !           219:    PJPS23,PIJP,  PIJP,  PIJP,  PIJP,  PIJP,  DTJP,  PIJP,   /* 28 ()*+,-./  */
        !           220:    DGJP,  DGJP,  DGJP,  DGJP,  DGJP,  DGJP,  DGJP,  DGJP,   /* 30 01234567  */
        !           221:    DGJP,  DGJP,  PIJP,  PIJP,  PIJP,  PIJP,  PIJP,  PIJP,   /* 38 89:;<=>?  */
        !           222:
        !           223:    IJP,   LIJP,  LJPS3, LIJP,  LJPS4, LIJP,  LIJP,  LIJP,   /* 40 @ABCDEFG  */
        !           224:    LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 48 HIJKLMNO  */
        !           225:    LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 50 PQRSTUVW  */
        !           226:    LIJP,  LIJP,  LIJP,  PIJP,  IJP,   PIJP,  PIJP,  LIJP,   /* 58 XYZ[\]^_  */
        !           227:    IJP,   LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 60 `abcdefg  */
        !           228:    LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 68 hijklmno  */
        !           229:    LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,  LIJP,   /* 70 pqrstuvw  */
        !           230:    LIJP,  LIJP,  LIJP,  PIJP,  PIJP,  PIJP,  PIJP,  000,    /* 78 xyz{|}~   */
        !           231:
        !           232:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   80 .. 87   */
        !           233:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   88 .. 8F   */
        !           234:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   90 .. 97   */
        !           235:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   98 .. 9F   */
        !           236:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   A0 .. A7   */
        !           237:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   A8 .. AF   */
        !           238:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   B0 .. B7   */
        !           239:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   B8 .. BF   */
        !           240:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   C0 .. C7   */
        !           241:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   C8 .. CF   */
        !           242:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   D0 .. D7   */
        !           243:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   D8 .. DF   */
        !           244:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   E0 .. E7   */
        !           245:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   E8 .. EF   */
        !           246:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   F0 .. F7   */
        !           247:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   F8 .. FF   */
        !           248: };
        !           249:
        !           250: /*
        !           251:  * For UTF8 multi-byte character encoding.
        !           252:  */
        !           253:
        !           254: #define U2_1    0x100       /* 1st byte of 2-byte encoding of UTF8  */
        !           255: #define U3_1    0x200       /* 1st byte of 3-byte encoding of UTF8  */
        !           256: #define U4_1    0x400       /* 1st byte of 4-byte encoding of UTF8  */
        !           257: #define UCONT   0x800   /* Continuation of a 2, 3, or 4 byte UTF8 sequence  */
        !           258: #define U2_1N   (NA | U2_1)
        !           259: #define U3_1N   (NA | U3_1)
        !           260: #define U4_1N   (NA | U4_1)
        !           261: #define UCONTN  (NA | UCONT)
        !           262:
                         /*
                          * Upper half of the table below:
                          *   0x80 .. 0xBF   UCONTN (continuation byte)
                          *   0xC2 .. 0xDF   U2_1N       0xE0 .. 0xEF   U3_1N
                          *   0xF0 .. 0xF4   U4_1N
                          *   0xC0, 0xC1 and 0xF5 .. 0xFF are NA only, i.e. they start no sequence.
                          */
        !           263: static short    type_utf8[ UCHARMAX + 1] = {
        !           264:
        !           265: /* Character type codes */
        !           266: /*   0,     1,     2,     3,     4,     5,     6,     7,                    */
        !           267: /*   8,     9,     A,     B,     C,     D,     E,     F,       Hex          */
        !           268:
        !           269:    000,   000,   000,   000,   000,   000,   000,   000,    /* 00           */
        !           270:    000,   HSPA,  SPA,   SPA,   SPA,   SPA,   000,   000,    /* 08           */
        !           271:    000,   000,   000,   000,   000,   000,   000,   000,    /* 10           */
        !           272:     /* 0x17-0x1A and 0x1F will be cleared in some modes by chk_opts()       */
        !           273:    000,   LET,   LET,   000,   000,   000,   000,   HSPA,   /* 18           */
        !           274:    HSPA,  PUNC,  QUO,   PUNC,  000,   PUNC,  PUNC,  QUO,    /* 20  !"#$%&'  */
        !           275:    PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  DOT,   PUNC,   /* 28 ()*+,-./  */
        !           276:    DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,   DIG,    /* 30 01234567  */
        !           277:    DIG,   DIG,   PUNC,  PUNC,  PUNC,  PUNC,  PUNC,  PUNC,   /* 38 89:;<=>?  */
        !           278:
        !           279:    000,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 40 @ABCDEFG  */
        !           280:    LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 48 HIJKLMNO  */
        !           281:    LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 50 PQRSTUVW  */
        !           282:    LET,   LET,   LET,   PUNC,  000,   PUNC,  PUNC,  LET,    /* 58 XYZ[\]^_  */
        !           283:    000,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 60 `abcdefg  */
        !           284:    LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 68 hijklmno  */
        !           285:    LET,   LET,   LET,   LET,   LET,   LET,   LET,   LET,    /* 70 pqrstuvw  */
        !           286:    LET,   LET,   LET,   PUNC,  PUNC,  PUNC,  PUNC,  000,    /* 78 xyz{|}~   */
        !           287:
        !           288:    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   80 .. 87   */
        !           289:    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   88 .. 8F   */
        !           290:    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   90 .. 97   */
        !           291:    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   98 .. 9F   */
        !           292:    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   A0 .. A7   */
        !           293:    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   A8 .. AF   */
        !           294:    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   B0 .. B7   */
        !           295:    UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN,UCONTN, /*   B8 .. BF   */
        !           296:    NA,    NA,    U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N,  /*   C0 .. C7   */
        !           297:    U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N,  /*   C8 .. CF   */
        !           298:    U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N,  /*   D0 .. D7   */
        !           299:    U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N, U2_1N,  /*   D8 .. DF   */
        !           300:    U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N,  /*   E0 .. E7   */
        !           301:    U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N, U3_1N,  /*   E8 .. EF   */
        !           302:    U4_1N, U4_1N, U4_1N, U4_1N, U4_1N, NA,    NA,    NA,     /*   F0 .. F7   */
        !           303:    NA,    NA,    NA,    NA,    NA,    NA,    NA,    NA,     /*   F8 .. FF   */
        !           304: };
        !           305:
        !           306: #define SETLOCALE       2       /* #pragma setlocale (not __setlocale)  */
        !           307:
        !           308: #define NUM_ENCODING    8
        !           309: #define NUM_ALIAS       6
        !           310:
        !           311: /* Names of encoding recognized.  Table for search_encoding().  */
                         /*
                          * One row per encoding.  The row index is what the switch in
                          * search_encoding() maps to mbchar: 0 no multi-byte character, 1 EUC_JP,
                          * 2 GB2312, 3 KSC5601, 4 SJIS, 5 BIGFIVE, 6 ISO2022_JP, 7 UTF8, so rows
                          * must not be reordered without updating that switch.
                          * Names are compared against the normalized name built by
                          * set_encoding() (lowered, with '_', '-' and '.' stripped).
                          * set_encoding() starts the search at column 2, or at column 0 for
                          * #pragma setlocale when COMPILER == MSC.
                          * "" fills the alias slots that have no name.
                          */
        !           312: static const char * const   encoding_name[ NUM_ENCODING][ NUM_ALIAS] = {
        !           313:     /* Visual C full, Visual C short
        !           314:         , 4 miscellaneous  */
        !           315:     { "english",    "c"
        !           316:         , "c",      "en",   "latin",    "iso8859"},
        !           317:     { "",     ""
        !           318:         , "eucjp",  "euc",  "ujis",     ""},
        !           319:     { "chinesesimplified",  "chs"
        !           320:         , "gb2312", "cngb",     "euccn",    ""},
        !           321:     { "korean",   "kor"
        !           322:         , "ksc5601",    "ksx1001",  "wansung",  "euckr"},
        !           323:     { "japanese", "jpn"
        !           324:         , "sjis",   "shiftjis", "mskanji",  ""},
        !           325:     { "chinesetraditional", "cht"
        !           326:         , "bigfive",    "big5", "cnbig5",   "euctw"},
        !           327:     { "",     ""
        !           328:         , "iso2022jp",  "iso2022jp1",   "jis",  ""},
        !           329:     { "",     ""
        !           330:         , "utf8",   "utf",      "",     ""},
        !           331: };
        !           332:
        !           333: static int      mbstart;
        !           334: static int      mb2;
                         /*
                          * NOTE(review): mbstart and mb2 are not referenced in the part of this
                          * file reviewed here; presumably they are set up by mb_init() for the
                          * selected encoding -- confirm against the rest of the file.
                          */
        !           335:
        !           336: static size_t   mb_read_2byte( int c1, char ** in_pp, char ** out_pp);
        !           337:                 /* For 2-byte encodings of mbchar   */
        !           338: static const char *     search_encoding( char * norm, int alias);
        !           339:                 /* Search encoding_name[][] table   */
        !           340: static void     strip_bar( char * string);
        !           341:                 /* Remove '_', '-' or '.' in the string */
        !           342: static void     conv_case( char * name, char * lim, int upper);
        !           343:                 /* Convert to upper/lower case      */
        !           344: static size_t   mb_read_iso2022_jp( int c1, char ** in_pp, char ** out_pp);
        !           345:                 /* For ISO2022_JP encoding          */
        !           346: static size_t   mb_read_utf8( int c1, char ** in_pp, char ** out_pp);
        !           347:                 /* For UTF8 mbchar encoding         */
        !           348:
                         /*
                          * NAMLEN is the size of the norm[] buffer in set_encoding(), terminator
                          * included.  UPPER and LOWER are the values passed as the 'upper'
                          * argument of conv_case().
                          */
        !           349: #define NAMLEN          20
        !           350: #define UPPER           1               /* To upper */
        !           351: #define LOWER           0               /* To lower */
        !           352:
        !           353:
        !           354: const char *    set_encoding(
        !           355:     char *  name,       /* Name of encoding specified   */
        !           356:     char *  env,        /* Name of environment variable */
        !           357:     int     pragma
        !           358:         /* 2: #pragma setlocale, 1: #pragma __setlocale, 0: not #pragma */
        !           359: )
        !           360: /*
        !           361:  * Search the encoding specified and re-initialize mbchar settings.
        !           362:  */
        !           363: {
        !           364:     const char *    unknown_encoding
        !           365:             = "Unknown encoding: %s%.0ld%.0s";          /* _W1_ */
        !           366:     const char *    too_long
        !           367:             = "Too long encoding name: %s%.0ld%.0s";    /* _E_  */
        !           368:     const char *    loc = "";
        !           369:     int     alias;
        !           370:     char    norm[ NAMLEN];
        !           371:             /*
        !           372:              * Normalized name (removed 'xxxxx.', stripped '_', '-', '.'
        !           373:              * and lowered.
        !           374:              */
        !           375:
        !           376:     if (strlen( name) >= NAMLEN) {
        !           377:         if ((env || pragma) && (warn_level & 1)) {
        !           378:             cwarn( too_long, name, 0L, NULL);
        !           379:         } else {
        !           380:             mcpp_fprintf( ERR, too_long, name);
        !           381:             mcpp_fputc( '\n', ERR);
        !           382:         }
        !           383:     }
        !           384:     strcpy( norm, name);
        !           385:     if (norm[ 5] == '.')
        !           386:         memmove( norm, norm + 5, strlen( norm + 5) + 1);
        !           387:         /* Remove initial 'xxxxx.' as 'ja_JP.', 'en_US.' or any other   */
        !           388:     conv_case( norm, norm + strlen( norm), LOWER);
        !           389:     strip_bar( norm);
        !           390:
        !           391:     if (strlen( name) == 0) {                       /* ""       */
        !           392:         mbchar = MBCHAR;    /* Restore to the default encoding  */
        !           393:     } else if (memcmp( norm, "iso8859", 7) == 0     /* iso8859* */
        !           394:             || memcmp( norm, "latin", 5) == 0       /* latin*   */
        !           395:             || memcmp( norm, "en", 2) == 0) {       /* en*      */
        !           396:         mbchar = 0;                 /* No multi-byte character  */
        !           397:     } else {
        !           398:         alias = 2;
        !           399: #if COMPILER == MSC
        !           400:         if (pragma == SETLOCALE)        /* #pragma setlocale    */
        !           401:             alias = 0;
        !           402: #endif
        !           403:         loc = search_encoding( norm, alias);        /* Search the name  */
        !           404:     }
        !           405:     if (loc == NULL) {
        !           406:         if ((env || pragma) && (warn_level & 1)) {
        !           407:             cwarn( unknown_encoding, name, 0L, NULL);
        !           408:         } else {                        /* -m option            */
        !           409:             mcpp_fprintf( ERR, unknown_encoding, name);
        !           410:             mcpp_fputc( '\n', ERR);
        !           411:         }
        !           412:     } else {
        !           413:         mb_init();                      /* Re-initialize        */
        !           414:     }
        !           415:     return  loc;
        !           416: }
        !           417:
        !           418: static const char * search_encoding(
        !           419:     char *  norm,           /* Encoding name to be looked up            */
        !           420:     int     alias           /* First alias column to be compared        */
        !           421: )
        !           422: {   /* Look 'norm' up in encoding_name[][]; on a hit set 'mbchar'.      */
        !           423:     const char *    entry;
        !           424:     int             row, col;
        !           425:
        !           426:     for (row = 0; row < NUM_ENCODING; row++) {
        !           427:         for (col = alias; col < NUM_ALIAS; col++) {
        !           428:             entry = encoding_name[ row][ col];
        !           429:             if (str_eq( entry, norm)) {
        !           430:                 /* The row of the table decides the encoding.   */
        !           431:                 if (row == 0)           mbchar = 0;
        !           432:                 else if (row == 1)      mbchar = EUC_JP;
        !           433:                 else if (row == 2)      mbchar = GB2312;
        !           434:                 else if (row == 3)      mbchar = KSC5601;
        !           435:                 else if (row == 4)      mbchar = SJIS;
        !           436:                 else if (row == 5)      mbchar = BIGFIVE;
        !           437:                 else if (row == 6)      mbchar = ISO2022_JP;
        !           438:                 else if (row == 7)      mbchar = UTF8;
        !           439:                 /* Any further row leaves 'mbchar' as it was.   */
        !           440:                 return  entry;      /* The matching table entry     */
        !           441:             }
        !           442:         }
        !           443:     }
        !           444:     return  NULL;                   /* No such name in the table    */
        !           445: }
        !           446:
        !           447: static void strip_bar(
        !           448:     char *  string          /* EOS-terminated string, edited in place   */
        !           449: )
        !           450: /*
        !           451:  * Strip '_', '-' or '.' in the string.  The kept bytes are compacted in a
        !           452:  * single pass instead of shifting the whole tail once per removed byte.  */
        !           453: {
        !           454:     char *  src = string;               /* Next byte to examine     */
        !           455:     char *  dst = string;               /* Next byte to write       */
        !           456:
        !           457:     for ( ; *src != EOS; src++) {
        !           458:         if (*src != '_' && *src != '-' && *src != '.')
        !           459:             *dst++ = *src;              /* Keep this byte           */
        !           460:     }
        !           461:     *dst = EOS;                         /* Terminate the shortened string   */
        !           462: }
        !           463:
        !           464: static void     conv_case(
        !           465:     char *  name,                       /* (diretory) Name          */
        !           466:     char *  lim,                        /* End of (directory) name  */
        !           467:     int     upper                       /* TRUE if to upper         */
        !           468: )
        !           469: /* Convert a string to upper-case letters or lower-case letters in-place    */
        !           470: {
        !           471:     int     c;
        !           472:     char *  sp;
        !           473:
        !           474:     for (sp = name; sp < lim; sp++) {
        !           475:         c = *sp & UCHARMAX;
        !           476: #if MBCHAR
        !           477:         if ((char_type[ c] & mbstart)) {
        !           478:             char    tmp[ PATHMAX+1];
        !           479:             char *  tp = tmp;
        !           480:             *tp++ = *sp++;
        !           481:             mb_read( c, &sp, &tp);
        !           482:         } else
        !           483: #endif
        !           484:         {
        !           485:             if (upper)
        !           486:                 *sp = toupper( c);
        !           487:             else
        !           488:                 *sp = tolower( c);
        !           489:         }
        !           490:     }
        !           491: }
        !           492:
        !           493: void    mb_init( void)
        !           494: /*
        !           495:  * Initialize multi-byte character settings.
        !           496:  * First called prior to setting the 'mcpp_mode'.
        !           497:  * Will be called again each time the multibyte character encoding is changed.
        !           498:  */
        !           499: {
        !           500:     /*
        !           501:      * Select the character classification table, select the multi-byte
        !           502:      * character reading routine and decide whether multi-byte character
        !           503:      * may contain the byte of value 0x5c.
        !           504:      */
        !           505:     switch (mbchar) {
        !           506:     case 0      :
        !           507:     case EUC_JP     :
        !           508:     case GB2312     :
        !           509:     case KSC5601    :
        !           510:         char_type = type_euc;
        !           511:         bsl_in_mbchar = FALSE;
        !           512:         mb_read = mb_read_2byte;
        !           513:         break;
        !           514:     case SJIS   :
        !           515:     case BIGFIVE    :
        !           516:         char_type = type_bsl;
        !           517:         bsl_in_mbchar = TRUE;
        !           518:         mb_read = mb_read_2byte;
        !           519:         break;
        !           520:     case ISO2022_JP :
        !           521:         char_type = type_iso2022_jp;
        !           522:         bsl_in_mbchar = TRUE;
        !           523:         mb_read = mb_read_iso2022_jp;
        !           524:         break;
        !           525:     case UTF8   :
        !           526:         char_type = type_utf8;
        !           527:         bsl_in_mbchar = FALSE;
        !           528:         mb_read = mb_read_utf8;
        !           529:         break;
        !           530:     }
        !           531:
        !           532:     /* Set the bit patterns for character classification.   */
        !           533:     switch (mbchar) {
        !           534:     case 0      :
        !           535:         mbstart = 0;
        !           536:         break;
        !           537:     case EUC_JP :
        !           538:         mbstart = EJ1;
        !           539:         mb2 = EJ2;
        !           540:         break;
        !           541:     case GB2312 :
        !           542:         mbstart = GB1;
        !           543:         mb2 = GB2;
        !           544:         break;
        !           545:     case KSC5601:
        !           546:         mbstart = KS1;
        !           547:         mb2 = KS2;
        !           548:         break;
        !           549:     case SJIS   :
        !           550:         mbstart = SJ1;
        !           551:         mb2 = SJ2;
        !           552:         break;
        !           553:     case BIGFIVE:
        !           554:         mbstart = BF1;
        !           555:         mb2 = BF2;
        !           556:         break;
        !           557:     case ISO2022_JP :
        !           558:         mbstart = IS1;
        !           559:         break;
        !           560:     case UTF8   :
        !           561:         mbstart = (U2_1 | U3_1 | U4_1);
        !           562:         break;
        !           563:     }
        !           564:     switch (mbchar) {
        !           565:     case 0      :
        !           566:         mbchk = 0;
        !           567:         break;
        !           568:     case EUC_JP :
        !           569:     case GB2312 :
        !           570:     case KSC5601:
        !           571:     case SJIS   :
        !           572:     case BIGFIVE:
        !           573:     case UTF8   :
        !           574:         mbchk = NA;
        !           575:         break;
        !           576:     case ISO2022_JP :
        !           577:         mbchk = (IS1 | NA);
        !           578:         break;
        !           579:     }
        !           580:
        !           581:     /*
        !           582:      * Set special handling for some encodings to supplement some compiler's
        !           583:      * deficiency.
        !           584:      */
        !           585:     switch (mbchar) {
        !           586:     case SJIS   :
        !           587: #if ! SJIS_IS_ESCAPE_FREE
        !           588:         bsl_need_escape = TRUE;
        !           589: #endif
        !           590:         break;
        !           591:     case BIGFIVE:
        !           592: #if ! BIGFIVE_IS_ESCAPE_FREE
        !           593:         bsl_need_escape = TRUE;
        !           594: #endif
        !           595:         break;
        !           596:     case ISO2022_JP :
        !           597: #if ! ISO2022_JP_IS_ESCAPE_FREE
        !           598:         bsl_need_escape = TRUE;
        !           599: #endif
        !           600:         break;
        !           601:     default :
        !           602:         bsl_need_escape = FALSE;
        !           603:         break;
        !           604:     }
        !           605:
        !           606:     /*
        !           607:      * Modify magic characters in character type table.
        !           608:      * char_type[] table should be rewritten in accordance with the 'mcpp_mode'
        !           609:      * whenever the encoding is changed.
        !           610:      */
        !           611:     if (mcpp_mode) {                /* If mcpp_mode is already set  */
        !           612:         char_type[ DEF_MAGIC] = standard ? LET : 0;
        !           613:         char_type[ IN_SRC] = (mcpp_mode == STD) ? LET : 0;
        !           614:         char_type[ TOK_SEP] = (mcpp_mode == STD || mcpp_mode == OLD_PREP)
        !           615:                 ? HSPA: 0;          /* TOK_SEP equals to COM_SEP    */
        !           616:     }
        !           617: }
        !           618:
        !           619: static size_t   mb_read_2byte(
        !           620:     int     c1,         /* The 1st byte of mbchar sequence (already read)   */
        !           621:     char ** in_pp,      /* In: -> 2nd byte of the sequence.  Out: -> first byte not consumed    */
        !           622:     char ** out_pp      /* In: -> where to store the 2nd byte.  Out: -> the EOS written here    */
        !           623: )
        !           624: /*
        !           625:  * Multi-byte character reading routine for 2-byte encodings.  Copies a run of
        !           626:  * 2-byte characters and returns their count, OR-ed with MB_ERROR on a bad 2nd byte. */
        !           627: {
        !           628:     int     error = FALSE;
        !           629:     size_t  len = 0;    /* Number of multi-byte characters read.    */
        !           630:     char *  in_p = *in_pp;
        !           631:     char *  out_p = *out_pp;
        !           632:
        !           633:     if (! (char_type[ c1 & UCHARMAX] & mbstart))
        !           634:         return  MB_ERROR;           /* Not a multi-byte character   */
        !           635:
        !           636:     do {
        !           637:         if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mb2)) {   /* Copy and check the 2nd byte  */
        !           638:             error = TRUE;
        !           639:             break;
        !           640:         }
        !           641:         len++;
        !           642:     } while (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & mbstart);   /* Copy the next byte; go on if it starts another character  */
        !           643:     *in_pp = --in_p;                /* Give back the byte that ended the scan   */
        !           644:     *(--out_p) = EOS;               /* Overwrite the copy of that byte          */
        !           645:     *out_pp = out_p;
        !           646:     return  error ? (len | MB_ERROR) : len;
        !           647: }
        !           648:
        !           649: static size_t   mb_read_iso2022_jp(
        !           650:     int     c1, /* The 1st byte of the sequence already read (always 0x1b). */
        !           651:     char ** in_pp,      /* In: -> byte after 'c1'.  Out: -> first byte not consumed */
        !           652:     char ** out_pp      /* In: -> output position.  Out: -> the EOS written here    */
        !           653: )
        !           654: /*
        !           655:  * Multi-byte character reading routine for ISO2022_JP.  Copies shift-sequences and
        !           656:  * the 2-byte characters they enclose; returns their count, OR-ed with MB_ERROR on error. */
        !           657: {
        !           658:     int     error = FALSE;
        !           659:     size_t  len = 0;
        !           660:     char *  in_p = *in_pp;
        !           661:     char *  out_p = *out_pp;
        !           662:     int     c2, c3, c4;
        !           663:
        !           664:     if (! (char_type[ c1 & UCHARMAX] & mbstart))
        !           665:         return  MB_ERROR;
        !           666:
        !           667:     do {
        !           668:
        !           669:         *out_p++ = c2 = *in_p++;                /* 2nd byte of the shift-sequence   */
        !           670:         if (! (char_type[ c2 & UCHARMAX] & IS2)) {
        !           671:             error = TRUE;
        !           672:             break;
        !           673:         }
        !           674:         *out_p++ = c3 = *in_p++;                /* 3rd byte of the shift-sequence   */
        !           675:         if (! (char_type[ c3 & UCHARMAX] & IS3)) {
        !           676:             error = TRUE;
        !           677:             break;
        !           678:         }
        !           679:
        !           680:         switch (c2) {
        !           681:         case 0x24   :
        !           682:             switch (c3) {
        !           683:             case 0x42   :   /* 0x1b 0x24 0x42:  JIS X 0208-1983 */
        !           684:                 break;
        !           685:             case 0x28   :
        !           686:                 *out_p++ = c4 = *in_p++;
        !           687:                 if (! (char_type[ c4 & UCHARMAX] & IS4))
        !           688:                     error = TRUE;
        !           689:                 /* else:    0x1b 0x24 0x28 0x44:    JIS X 0212  */
        !           690:                 break;
        !           691:             default :
        !           692:                 error = TRUE;
        !           693:             }
        !           694:             break;
        !           695:         case 0x28   :
        !           696:             switch (c3) {
        !           697:             case 0x42   :   /* 0x1b 0x28 0x42:  ASCII   */
        !           698:                 c1 = (*out_p++ = *in_p++) & UCHARMAX;   /* Mask after the store: keeps 'c1' a valid index   */
        !           699:                 continue;                       /* Jumps to the loop condition below    */
        !           700:             default :
        !           701:                 error = TRUE;
        !           702:             }
        !           703:             break;
        !           704:         }
        !           705:         if (error)
        !           706:             break;
        !           707:
        !           708:         while (char_type[ c1 = (*out_p++ = *in_p++) & UCHARMAX] & IJP) {   /* 1st byte of a character  */
        !           709:             if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & IJP)) {  /* 2nd byte of a character  */
        !           710:                 error = TRUE;
        !           711:                 break;
        !           712:             }
        !           713:             len++;          /* String of multi-byte characters  */
        !           714:         }
        !           715:         if (error)
        !           716:             break;
        !           717:
        !           718:     } while (char_type[ c1] & IS1);     /* 0x1b:    start of shift-sequence */
        !           719:
        !           720:     *in_pp = --in_p;                    /* Give back the byte that ended the scan   */
        !           721:     *(--out_p) = EOS;                   /* Overwrite the copy of that byte          */
        !           722:     *out_pp = out_p;
        !           723:     return  error ? (len | MB_ERROR) : len;
        !           724: }
        !           725:
        !           726: static size_t   mb_read_utf8(
        !           727:     int     c1,         /* The 1st byte of the sequence (already read)          */
        !           728:     char ** in_pp,      /* In: -> 2nd byte.  Out: -> first byte not consumed    */
        !           729:     char ** out_pp      /* In: -> output position.  Out: -> the EOS written here    */
        !           730: )
        !           731: /*
        !           732:  * Multi-byte character reading routine for UTF8.  Copies a run of 2-, 3- or 4-byte
        !           733:  * sequences and returns their count, OR-ed with MB_ERROR on an invalid sequence. */
        !           734: {
        !           735:     int     error = FALSE;
        !           736:     size_t  len = 0;                /* Number of characters accepted so far */
        !           737:     char *  in_p = *in_pp;
        !           738:     char *  out_p = *out_pp;
        !           739:
        !           740:     if (! (char_type[ c1 & UCHARMAX] & mbstart))
        !           741:         return  MB_ERROR;           /* Not a multi-byte character   */
        !           742:
        !           743:     do {
        !           744:         unsigned int    codepoint;
        !           745:         int             i, bytes;   /* NOTE(review): 'bytes' stays unset if no branch below matches -- confirm that cannot happen   */
        !           746:
        !           747:         if ((char_type[ c1 & UCHARMAX] & U4_1) == U4_1)
        !           748:             bytes = 4;                          /* 4-byte character */
        !           749:         else if ((char_type[ c1 & UCHARMAX] & U3_1) == U3_1)
        !           750:             bytes = 3;                          /* 3-byte character */
        !           751:         else if ((char_type[ c1 & UCHARMAX] & U2_1) == U2_1)
        !           752:             bytes = 2;                          /* 2-byte character */
        !           753:
        !           754:         /* Start the code point with the low 5, 4 or 3 bits of the lead byte   */
        !           755:         codepoint = ((2 << (6-bytes)) - 1) & c1;    /* mask off top bits    */
        !           756:
        !           757:         /* All bytes left in the sequence must be in 0x80 - 0xBF    */
        !           758:         for (i = bytes - 1; i && !error; i--) {
        !           759:             codepoint = (codepoint << 6) + ((*in_p) & 0x3fU);  /* Append its low 6 bits   */
        !           760:             if (! (char_type[ (*out_p++ = *in_p++) & UCHARMAX] & UCONT))
        !           761:                 error = TRUE;
        !           762:         }
        !           763:
        !           764:         /* Check that the code point fits the range of its sequence length */
        !           765:         if ((bytes == 2 && (codepoint < 0x80 || codepoint > 0x7FF))
        !           766:             || (bytes == 3 && (codepoint < 0x800 || codepoint > 0xFFFF))
        !           767:             || (bytes == 4 && (codepoint < 0x10000 || codepoint > 0x10FFFF)))
        !           768:             error = TRUE;
        !           769:         if ((codepoint >= 0xD800 && codepoint <= 0xDFFF)
        !           770:             /* Check for reserved surrogate codepoints */
        !           771:                 || (codepoint >= 0xFFFE && codepoint <= 0xFFFF))
        !           772:                 /* Illegal  */
        !           773:             error = TRUE;
        !           774: #if 0
        !           775:         printf( "codepoint:0x%x\n", codepoint);
        !           776: #endif
        !           777:         if (error)
        !           778:             break;
        !           779:         len++;
        !           780:     } while (char_type[ (*out_p++ = c1 = *in_p++) & UCHARMAX] & mbstart);
        !           781:                         /* Start of the next multi-byte character   */
        !           782:     *in_pp = --in_p;                /* Give back the byte that ended the scan   */
        !           783:     *(--out_p) = EOS;               /* Overwrite the copy of that byte          */
        !           784:     *out_pp = out_p;
        !           785:     return  error ? (len | MB_ERROR) : len;
        !           786: }
        !           787:
        !           788: uexpr_t     mb_eval(
        !           789:     char ** seq_pp          /* In: -> 1st byte to evaluate.  Out: -> byte after those consumed  */
        !           790: )
        !           791: /*
        !           792:  * Evaluate the value of a multi-byte character: its bytes are packed into the
        !           793:  * result 8 bits at a time, first byte highest.  A byte that is not a start byte is returned as is.
        !           794:  * This routine does not check the legality of the sequence.  It is called from
        !           795:  * eval_char().  This routine is never called in POST_STD mode.
        !           796:  */
        !           797: {
        !           798:     char *      seq = *seq_pp;
        !           799:     uexpr_t     val = 0;
        !           800:     int         c, c1;
        !           801:
        !           802:     if (! (char_type[ c = *seq++ & UCHARMAX] & mbstart)) {
        !           803:         *seq_pp = seq;
        !           804:         return  c;                  /* Not a multi-byte character   */
        !           805:     }
        !           806:
        !           807:     switch (mbchar) {
        !           808:     case EUC_JP :
        !           809:     case GB2312 :
        !           810:     case KSC5601:
        !           811:     case SJIS   :
        !           812:     case BIGFIVE:
        !           813:         val = (c << 8) + (*seq++ & UCHARMAX);
        !           814:         /* Evaluate the 2-byte sequence */
        !           815:         break;
        !           816:     case ISO2022_JP :
        !           817:         if (char_type[ c & UCHARMAX] & IS1) {   /* Skip shift-sequence  */
        !           818:             if (char_type[ c = *seq++ & UCHARMAX] & IS2) {
        !           819:                 if (char_type[ c1 = *seq++ & UCHARMAX] & IS3) {
        !           820:                     if (c1 == 0x28)             /* A 4-byte shift-sequence: */
        !           821:                         seq++;                  /* skip its last byte too   */
        !           822:                     if (c == 0x28 && c1 == 0x42) {  /* Shift-out sequence   */
        !           823:                         val = 0;
        !           824:                         break;
        !           825:                     }
        !           826:                     c = *seq++ & UCHARMAX;      /* 1st byte of the character itself */
        !           827:                 }
        !           828:             }
        !           829:         }
        !           830:         val = (c << 8) + (*seq++ & UCHARMAX);       /* Evaluate the 2-bytes */
        !           831:         break;
        !           832:     case UTF8   :   /* Evaluate the sequence of 2, 3 or 4 bytes as it is    */
        !           833:         val = (c << 8) + (*seq++ & UCHARMAX);
        !           834:         if (char_type[ c & UCHARMAX] & U3_1) {          /* One more byte    */
        !           835:             val = (val << 8) + (*seq++ & UCHARMAX);
        !           836:         } else if (char_type[ c & UCHARMAX] & U4_1) {   /* Two more bytes   */
        !           837:             val = (val << 8) + (*seq++ & UCHARMAX);
        !           838:             val = (val << 8) + (*seq++ & UCHARMAX);
        !           839:         }
        !           840:         break;
        !           841:     }
        !           842:
        !           843:     *seq_pp = seq;
        !           844:     return  val;
        !           845: }
        !           846:
        !           847: int  last_is_mbchar(
        !           848:     const char *  in,               /* Input physical line          */
        !           849:     int     len                     /* Length of the line minus 2   */
        !           850: )
        !           851: /*
        !           852:  * Return 2, if the last char of the line is second byte of SJIS or BIGFIVE,
        !           853:  * else return 0.
        !           854:  */
        !           855: {
        !           856:     int     run = 0;    /* Count of start-byte candidates just before in[ len]  */
        !           857:
        !           858:     if ((mbchar & (SJIS | BIGFIVE)) == 0)
        !           859:         return  0;                  /* Neither SJIS nor BIGFIVE     */
        !           860:     /*
        !           861:      * Search backwardly from the char before in[ len], by index so that no
        !           862:      * pointer is ever stepped below 'in'.  in[ len] is the char before '\n'. */
        !           863:     while (run < len
        !           864:             && (char_type[ in[ len - 1 - run] & UCHARMAX] & mbstart) != 0)
        !           865:         run++;                      /* Still a possible first byte of MBCHAR    */
        !           866:     if (run & 1)    /* An odd run leaves one start byte unpaired: in[ len] is its 2nd byte  */
        !           867:         return  2;
        !           868:     return  0;
        !           869: }
        !           870:
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>