[BACK]Return to popham.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k6 / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k6/mmx/popham.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K6-2 mpn_popcount, mpn_hamdist -- mpn bit population count and
                      2: dnl  hamming distance.
                      3: dnl
                      4: dnl         popcount  hamdist
                      5: dnl  K6-2:    9.0       11.5   cycles/limb
                      6: dnl  K6:      12.5      13.0
                      7:
                      8:
                      9: dnl  Copyright (C) 2000 Free Software Foundation, Inc.
                     10: dnl
                     11: dnl  This file is part of the GNU MP Library.
                     12: dnl
                     13: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     14: dnl  modify it under the terms of the GNU Lesser General Public License as
                     15: dnl  published by the Free Software Foundation; either version 2.1 of the
                     16: dnl  License, or (at your option) any later version.
                     17: dnl
                     18: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     19: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     20: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     21: dnl  Lesser General Public License for more details.
                     22: dnl
                     23: dnl  You should have received a copy of the GNU Lesser General Public
                     24: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     25: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     26: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     27:
                     28:
                     29: include(`../config.m4')
                     30:
                     31:
                     32: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
                     33: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
                     34: C
                     35: C The code here isn't optimal, but it's already a 2x speedup over the plain
                     36: C integer mpn/generic/popcount.c,hamdist.c.
                     37:
                     38: C Stop the m4 run with exit status 1 unless OPERATION_popcount or OPERATION_hamdist is defined.
                     39: ifdef(`OPERATION_popcount',,
                     40: `ifdef(`OPERATION_hamdist',,
                     41: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist
                     42: ')m4exit(1)')')
                     43: C HAM(x) expands to x only when OPERATION_hamdist is defined and to nothing otherwise.
                     44: define(HAM,
                     45: m4_assert_numargs(1)
                     46: `ifdef(`OPERATION_hamdist',`$1')')
                     47: C POP(x) is the popcount counterpart: x is kept only when OPERATION_popcount is defined.
                     48: define(POP,
                     49: m4_assert_numargs(1)
                     50: `ifdef(`OPERATION_popcount',`$1')')
                     51: C Per-operation argument names and function name. NOTE(review): the defframe numbers are presumably byte offsets from esp at entry - confirm against defframe.
                     52: HAM(`
                     53: defframe(PARAM_SIZE,   12)
                     54: defframe(PARAM_SRC2,   8)
                     55: defframe(PARAM_SRC,    4)
                     56: define(M4_function,mpn_hamdist)
                     57: ')
                     58: POP(`
                     59: defframe(PARAM_SIZE,   8)
                     60: defframe(PARAM_SRC,    4)
                     61: define(M4_function,mpn_popcount)
                     62: ')
                     63: C Lists both functions this one source can produce. NOTE(review): presumably read by the build machinery - confirm.
                     64: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
                     65:
                     66: C Non-PIC builds only: the four 64-bit masks stored as data. PIC builds construct the same values in registers instead.
                     67: ifdef(`PIC',,`
                     68:        dnl  non-PIC
                     69:
1.1.1.2 ! maekawa    70:        DATA
1.1       maekawa    71:        ALIGN(8)
                     72: C LS(name) expands to LF(M4_function,name). NOTE(review): presumably this gives each build its own label names - confirm against LF.
                     73: define(LS,
                     74: m4_assert_numargs(1)
                     75: `LF(M4_function,`$1')')
                     76: C Each constant below is two identical .long values forming one 64-bit mask.
                     77: LS(rodata_AAAAAAAAAAAAAAAA):
                     78:        .long   0xAAAAAAAA
                     79:        .long   0xAAAAAAAA
                     80:
                     81: LS(rodata_3333333333333333):
                     82:        .long   0x33333333
                     83:        .long   0x33333333
                     84:
                     85: LS(rodata_0F0F0F0F0F0F0F0F):
                     86:        .long   0x0F0F0F0F
                     87:        .long   0x0F0F0F0F
                     88:
                     89: LS(rodata_000000FF000000FF):
                     90:        .long   0x000000FF
                     91:        .long   0x000000FF
                     92: ')
                     93: C Code section. NOTE(review): ALIGN(32) presumably requests 32-byte alignment for the entry point - confirm against ALIGN.
                     94:        .text
                     95:        ALIGN(32)
                     96: C PIC popcount only: one nop of padding is emitted ahead of the entry point.
                     97: POP(`ifdef(`PIC', `
                     98:        C avoid shrl crossing a 32-byte boundary
                     99:        nop')')
                    100: C M4_function entry. Returns in eax the number of 1 bits in the size 32-bit limbs at src (popcount) or in src XOR src2 (hamdist).
                    101: PROLOGUE(M4_function)
                    102: deflit(`FRAME',0)
                    103: C Registers written: eax ecx, edx (hamdist or PIC only), mm0 mm1 mm2 and mm4-mm7. mm3 is unused.
                    104:        movl    PARAM_SIZE, %ecx
                    105:        orl     %ecx, %ecx
                    106:        jz      L(zero)         C size 0: return 0 without touching MMX
                    107: C Load the four masks into mm7 mm6 mm5 mm4: built from immediates under PIC, read from the data constants otherwise.
                    108: ifdef(`PIC',`
                    109:        movl    $0xAAAAAAAA, %eax
                    110:        movl    $0x33333333, %edx
                    111: C movd places the 32-bit pattern in the low dword; punpckldq further down copies it into the high dword.
                    112:        movd    %eax, %mm7
                    113:        movd    %edx, %mm6
                    114:
                    115:        movl    $0x0F0F0F0F, %eax
                    116:        movl    $0x000000FF, %edx
                    117:
                    118:        punpckldq %mm7, %mm7
                    119:        punpckldq %mm6, %mm6
                    120:
                    121:        movd    %eax, %mm5
                    122:        movd    %edx, %mm4
                    123:
                    124:        punpckldq %mm5, %mm5
                    125:        punpckldq %mm4, %mm4
                    126: ',`
                    127:
                    128:        movq    LS(rodata_AAAAAAAAAAAAAAAA), %mm7
                    129:        movq    LS(rodata_3333333333333333), %mm6
                    130:        movq    LS(rodata_0F0F0F0F0F0F0F0F), %mm5
                    131:        movq    LS(rodata_000000FF000000FF), %mm4
                    132: ')
                    133: C Symbolic names for the registers that hold each mask.
                    134: define(REG_AAAAAAAAAAAAAAAA, %mm7)
                    135: define(REG_3333333333333333, %mm6)
                    136: define(REG_0F0F0F0F0F0F0F0F, %mm5)
                    137: define(REG_000000FF000000FF, %mm4)
                    138:
                    139:
                    140:        movl    PARAM_SRC, %eax
                    141: HAM(`  movl    PARAM_SRC2, %edx')
                    142:
                    143:        pxor    %mm2, %mm2      C total
                    144: C ecx becomes the qword count (size/2); the carry flag receives the low bit of size.
                    145:        shrl    %ecx
                    146:        jnc     L(top)
                    147: C Odd size: fetch the final 32-bit limb on its own (movd zero-fills the high dword), then enter the loop body at L(loaded).
                    148: Zdisp( movd,   0,(%eax,%ecx,8), %mm1)
                    149:
                    150: HAM(`
                    151: Zdisp( movd,   0,(%edx,%ecx,8), %mm0)
                    152:        pxor    %mm0, %mm1
                    153: ')
                    154:
                    155:        incl    %ecx            C count the odd limb as one more loop pass
                    156:        jmp     L(loaded)
                    157:
                    158:
                    159:        ALIGN(16)
                    160: POP(`  nop     C alignment to avoid crossing 32-byte boundaries')
                    161:
                    162: L(top):
                    163:        C eax   src
                    164:        C ebx
                    165:        C ecx   counter, qwords, decrementing
                    166:        C edx   [hamdist] src2
                    167:        C
                    168:        C mm0   (scratch)
                    169:        C mm1   (scratch)
                    170:        C mm2   total (low dword)
                    171:        C mm3
                    172:        C mm4   \
                    173:        C mm5   | special constants
                    174:        C mm6   |
                    175:        C mm7   /
                    176: C Fetch qword number ecx-1, so the data is processed from the highest address downwards. hamdist XORs in the matching src2 qword.
                    177:        movq    -8(%eax,%ecx,8), %mm1
                    178: HAM(`  pxor    -8(%edx,%ecx,8), %mm1')
                    179: C Reduce mm1 to its bit count by summing progressively wider fields.
                    180: L(loaded):
                    181:        movq    %mm1, %mm0
                    182:        pand    REG_AAAAAAAAAAAAAAAA, %mm1
                    183: C x - ((x AND 0xAA..) >> 1) leaves a count of 0 to 2 in every 2-bit field.
                    184:        psrlq   $1, %mm1
                    185: HAM(`  nop                     C code alignment')
                    186:
                    187:        psubd   %mm1, %mm0      C bit pairs
                    188: HAM(`  nop                     C code alignment')
                    189: C Add neighbouring 2-bit counts into 4-bit fields.
                    190:
                    191:        movq    %mm0, %mm1
                    192:        psrlq   $2, %mm0
                    193:
                    194:        pand    REG_3333333333333333, %mm0
                    195:        pand    REG_3333333333333333, %mm1
                    196:
                    197:        paddd   %mm1, %mm0      C nibbles
                    198: C Add neighbouring nibble counts into bytes.
                    199:
                    200:        movq    %mm0, %mm1
                    201:        psrlq   $4, %mm0
                    202:
                    203:        pand    REG_0F0F0F0F0F0F0F0F, %mm0
                    204:        pand    REG_0F0F0F0F0F0F0F0F, %mm1
                    205:
                    206:        paddd   %mm1, %mm0      C bytes
                    207: C Add neighbouring byte counts using a bytewise add.
                    208:        movq    %mm0, %mm1
                    209:        psrlq   $8, %mm0
                    210:
                    211:
                    212:        paddb   %mm1, %mm0      C words
                    213: C Add fields 16 bits apart so the low byte of each dword holds the count for that whole dword.
                    214:
                    215:        movq    %mm0, %mm1
                    216:        psrlq   $16, %mm0
                    217:
                    218:        paddd   %mm1, %mm0      C dwords
                    219: C Keep only the low byte of each dword, then add both dword counts into the low dword of mm2.
                    220:        pand    REG_000000FF000000FF, %mm0
                    221:
                    222:        paddd   %mm0, %mm2      C low to total
                    223:        psrlq   $32, %mm0
                    224:
                    225:        paddd   %mm0, %mm2      C high to total
                    226:        loop    L(top)          C decrement ecx and repeat while nonzero
                    227:
                    228: C The result is the low dword of the running total.
                    229:
                    230:        movd    %mm2, %eax
                    231:        emms_or_femms
                    232:        ret
                    233: C Reached only when size was 0; no MMX register has been written on this path.
                    234: L(zero):
                    235:        movl    $0, %eax
                    236:        ret
                    237:
                    238: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>