[BACK]Return to popham.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/popham.asm, Revision 1.1.1.3

1.1       maekawa     1: dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
                      2: dnl  hamming distance.
                      3:
1.1.1.3 ! ohara       4: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     5: dnl
                      6: dnl  This file is part of the GNU MP Library.
                      7: dnl
                      8: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      9: dnl  modify it under the terms of the GNU Lesser General Public License as
                     10: dnl  published by the Free Software Foundation; either version 2.1 of the
                     11: dnl  License, or (at your option) any later version.
                     12: dnl
                     13: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     14: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     15: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     16: dnl  Lesser General Public License for more details.
                     17: dnl
                     18: dnl  You should have received a copy of the GNU Lesser General Public
                     19: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     20: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     21: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     22:
                     23: include(`../config.m4')
                     24:
                     25:
1.1.1.3 ! ohara      26: C        popcount  hamdist
        !            27: C K6-2:    9.0       11.5   cycles/limb
        !            28: C K6:      12.5      13.0
        !            29:
        !            30:
1.1       maekawa    31: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
                     32: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
                     33: C
                     34: C The code here isn't optimal, but it's already a 2x speedup over the plain
                     35: C integer mpn/generic/popcount.c,hamdist.c.
                     36:
                     37: C Build guard: if neither OPERATION_popcount nor OPERATION_hamdist is defined then m4 reports an error and exits with status 1.
                     38: ifdef(`OPERATION_popcount',,
                     39: `ifdef(`OPERATION_hamdist',,
                     40: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist
                     41: ')m4exit(1)')')
                     42: C HAM takes one argument and expands to it only when OPERATION_hamdist is defined.  Otherwise it expands to nothing.
                     43: define(HAM,
                     44: m4_assert_numargs(1)
                     45: `ifdef(`OPERATION_hamdist',`$1')')
                     46: C POP takes one argument and expands to it only when OPERATION_popcount is defined.  Otherwise it expands to nothing.
                     47: define(POP,
                     48: m4_assert_numargs(1)
                     49: `ifdef(`OPERATION_popcount',`$1')')
                     50: C Per-operation parameter names and the function name.  The numbers follow the argument order of the C prototypes above with the first argument at 4.  NOTE(review): presumably defframe turns them into stack offsets - confirm in the x86 m4 definitions.
                     51: HAM(`
                     52: defframe(PARAM_SIZE,   12)
                     53: defframe(PARAM_SRC2,   8)
                     54: defframe(PARAM_SRC,    4)
                     55: define(M4_function,mpn_hamdist)
                     56: ')
                     57: POP(`
                     58: defframe(PARAM_SIZE,   8)
                     59: defframe(PARAM_SRC,    4)
                     60: define(M4_function,mpn_popcount)
                     61: ')
                     62: C NOTE(review): MULFUNC_PROLOGUE presumably declares the functions this multi-function file can provide - confirm against its definition.
                     63: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
                     64:
                     65: C Non-PIC builds only: the four 64-bit masks used by the bit-count reduction are kept as data.  PIC builds skip this and build the same masks from immediates inside the function.
                     66: ifdef(`PIC',,`
                     67:        dnl  non-PIC
                     68:
1.1.1.3 ! ohara      69:        RODATA
1.1       maekawa    70:        ALIGN(8)        C 8-byte alignment for the qword constants below
                     71:
1.1.1.3 ! ohara      72: L(rodata_AAAAAAAAAAAAAAAA):
1.1       maekawa    73:        .long   0xAAAAAAAA
                     74:        .long   0xAAAAAAAA
                     75:
1.1.1.3 ! ohara      76: L(rodata_3333333333333333):
1.1       maekawa    77:        .long   0x33333333
                     78:        .long   0x33333333
                     79:
1.1.1.3 ! ohara      80: L(rodata_0F0F0F0F0F0F0F0F):
1.1       maekawa    81:        .long   0x0F0F0F0F
                     82:        .long   0x0F0F0F0F
                     83:
1.1.1.3 ! ohara      84: L(rodata_000000FF000000FF):
1.1       maekawa    85:        .long   0x000000FF
                     86:        .long   0x000000FF
                     87: ')
                     88: C Function body: counts the set bits of src (popcount) or of src XOR src2 (hamdist) using MMX and handling one qword (two limbs) per loop pass.
1.1.1.3 ! ohara      89:        TEXT
1.1       maekawa    90:        ALIGN(32)
                     91:
                     92: POP(`ifdef(`PIC', `
                     93:        C avoid shrl crossing a 32-byte boundary
                     94:        nop')')
                     95: C Entry point: M4_function is mpn_popcount or mpn_hamdist as selected by the HAM/POP blocks above.  The total is returned in eax.
                     96: PROLOGUE(M4_function)
                     97: deflit(`FRAME',0)
                     98:
                     99:        movl    PARAM_SIZE, %ecx        C ecx = size in limbs
                    100: C Fill mm7..mm4 with the four masks: PIC builds them from immediates and non-PIC loads them from the rodata labels.
                    101: ifdef(`PIC',`
                    102:        movl    $0xAAAAAAAA, %eax
                    103:        movl    $0x33333333, %edx
                    104:
                    105:        movd    %eax, %mm7
                    106:        movd    %edx, %mm6
                    107:
                    108:        movl    $0x0F0F0F0F, %eax
                    109:        movl    $0x000000FF, %edx
                    110:
                    111:        punpckldq %mm7, %mm7    C copy the low dword into the high dword
                    112:        punpckldq %mm6, %mm6
                    113:
                    114:        movd    %eax, %mm5
                    115:        movd    %edx, %mm4
                    116:
                    117:        punpckldq %mm5, %mm5
                    118:        punpckldq %mm4, %mm4
                    119: ',`
                    120:
1.1.1.3 ! ohara     121:        movq    L(rodata_AAAAAAAAAAAAAAAA), %mm7
        !           122:        movq    L(rodata_3333333333333333), %mm6
        !           123:        movq    L(rodata_0F0F0F0F0F0F0F0F), %mm5
        !           124:        movq    L(rodata_000000FF000000FF), %mm4
1.1       maekawa   125: ')
                    126:
                    127: define(REG_AAAAAAAAAAAAAAAA, %mm7)
                    128: define(REG_3333333333333333, %mm6)
                    129: define(REG_0F0F0F0F0F0F0F0F, %mm5)
                    130: define(REG_000000FF000000FF, %mm4)
                    131:
                    132:
                    133:        movl    PARAM_SRC, %eax
                    134: HAM(`  movl    PARAM_SRC2, %edx')
                    135:
                    136:        pxor    %mm2, %mm2      C total
                    137:
                    138:        shrl    %ecx            C ecx = number of whole qwords and carry is set when one odd limb is left over
                    139:        jnc     L(top)
                    140: C Odd size: fetch the single leftover limb at the end of the data with a dword load, which leaves the upper half of mm1 zero.
                    141: Zdisp( movd,   0,(%eax,%ecx,8), %mm1)
                    142:
                    143: HAM(`
                    144: Zdisp( movd,   0,(%edx,%ecx,8), %mm0)
                    145:        pxor    %mm0, %mm1
                    146: ')
                    147:
                    148:        incl    %ecx            C the leftover limb counts as one extra loop pass
                    149:        jmp     L(loaded)
                    150:
                    151:
                    152:        ALIGN(16)
                    153: POP(`  nop     C alignment to avoid crossing 32-byte boundaries')
                    154:
                    155: L(top):
                    156:        C eax   src
                    157:        C ebx
                    158:        C ecx   counter, qwords, decrementing
                    159:        C edx   [hamdist] src2
                    160:        C
                    161:        C mm0   (scratch)
                    162:        C mm1   (scratch)
                    163:        C mm2   total (low dword)
                    164:        C mm3
                    165:        C mm4   \
                    166:        C mm5   | special constants
                    167:        C mm6   |
                    168:        C mm7   /
                    169:
                    170:        movq    -8(%eax,%ecx,8), %mm1
                    171: HAM(`  pxor    -8(%edx,%ecx,8), %mm1')
                    172: C Both entry paths arrive here with the data to be counted in mm1.
                    173: L(loaded):
                    174:        movq    %mm1, %mm0
                    175:        pand    REG_AAAAAAAAAAAAAAAA, %mm1      C isolate the high bit of every bit pair
                    176:
                    177:        psrlq   $1, %mm1
                    178: HAM(`  nop                     C code alignment')
                    179:
                    180:        psubd   %mm1, %mm0      C bit pairs
                    181: HAM(`  nop                     C code alignment')
                    182:
                    183:
                    184:        movq    %mm0, %mm1
                    185:        psrlq   $2, %mm0
                    186:
                    187:        pand    REG_3333333333333333, %mm0
                    188:        pand    REG_3333333333333333, %mm1
                    189:
                    190:        paddd   %mm1, %mm0      C nibbles
                    191:
                    192:
                    193:        movq    %mm0, %mm1
                    194:        psrlq   $4, %mm0
                    195:
                    196:        pand    REG_0F0F0F0F0F0F0F0F, %mm0
                    197:        pand    REG_0F0F0F0F0F0F0F0F, %mm1
                    198:
                    199:        paddd   %mm1, %mm0      C bytes
                    200:
                    201:        movq    %mm0, %mm1
                    202:        psrlq   $8, %mm0
                    203:
                    204:
                    205:        paddb   %mm1, %mm0      C words
                    206:
                    207:
                    208:        movq    %mm0, %mm1
                    209:        psrlq   $16, %mm0
                    210:
                    211:        paddd   %mm1, %mm0      C dwords
                    212:
                    213:        pand    REG_000000FF000000FF, %mm0      C keep only the low byte of each dword, which holds that dword's bit count
                    214:
                    215:        paddd   %mm0, %mm2      C low to total
                    216:        psrlq   $32, %mm0
                    217:
                    218:        paddd   %mm0, %mm2      C high to total
                    219:        loop    L(top)          C decrement ecx and repeat while it is nonzero
                    220:
                    221:
                    222:
                    223:        movd    %mm2, %eax      C return value = low dword of the running total
                    224:        emms_or_femms
                    225:        ret
                    226:
                    227: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>