[BACK]Return to popham.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium4 / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/mmx/popham.asm, Revision 1.1.1.1

1.1       ohara       1: dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
                      2: dnl  hamming distance.
                      3:
                      4: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
                      5: dnl
                      6: dnl  This file is part of the GNU MP Library.
                      7: dnl
                      8: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      9: dnl  modify it under the terms of the GNU Lesser General Public License as
                     10: dnl  published by the Free Software Foundation; either version 2.1 of the
                     11: dnl  License, or (at your option) any later version.
                     12: dnl
                     13: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     14: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     15: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     16: dnl  Lesser General Public License for more details.
                     17: dnl
                     18: dnl  You should have received a copy of the GNU Lesser General Public
                     19: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     20: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     21: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     22:
                     23: include(`../config.m4')
                     24:
                     25:
                     26: C P4: popcount 8.5 cycles/limb
                     27: C     hamdist  9.5 cycles/limb
                     28:
                     29:
                     30: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
                     31: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
                     32: C
                     33: C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
                     34: C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
                     35: C and using them saves fiddling about with alignment testing on entry.
                     36: C
                     37: C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
                     38: C might be possible, but 8.5 c/l relying on out-of-order execution is
                     39: C already quite reasonable.
                     40:
                     41: ifdef(`OPERATION_popcount',,
                     42: `ifdef(`OPERATION_hamdist',,
                     43: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
                     44: ')')')          C abort in m4 unless one of the two operations was selected
                     45: C HAM(text) expands to text when OPERATION_hamdist is defined, otherwise to nothing.
                     46: define(HAM,
                     47: m4_assert_numargs(1)
                     48: `ifdef(`OPERATION_hamdist',`$1')')
                     49: C POP(text) expands to text when OPERATION_popcount is defined, otherwise to nothing.
                     50: define(POP,
                     51: m4_assert_numargs(1)
                     52: `ifdef(`OPERATION_popcount',`$1')')
                     53: C Name the stack arguments (offsets 4, 8, 12 in prototype order) and pick M4_function for the selected operation.
                     54: HAM(`
                     55: defframe(PARAM_SIZE, 12)
                     56: defframe(PARAM_SRC2,  8)
                     57: defframe(PARAM_SRC,   4)
                     58: define(M4_function,mpn_hamdist)
                     59: ')
                     60: POP(`
                     61: defframe(PARAM_SIZE,  8)
                     62: defframe(PARAM_SRC,   4)
                     63: define(M4_function,mpn_popcount)
                     64: ')
                     65: C NOTE(review): presumably tells the build which functions this one file can provide -- confirm against the macro definition.
                     66: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
                     67:
                     68: C Non-PIC builds only: the three 64-bit masks, 8-byte aligned in read-only data, loaded below with movq.
                     69: ifdef(`PIC',,`
                     70:        dnl  non-PIC
                     71:        RODATA
                     72:        ALIGN(8)
                     73: L(rodata_AAAAAAAAAAAAAAAA):
                     74:        .long   0xAAAAAAAA
                     75:        .long   0xAAAAAAAA
                     76: L(rodata_3333333333333333):
                     77:        .long   0x33333333
                     78:        .long   0x33333333
                     79: L(rodata_0F0F0F0F0F0F0F0F):
                     80:        .long   0x0F0F0F0F
                     81:        .long   0x0F0F0F0F
                     82: ')
                     83:
                     84:        TEXT
                     85:        ALIGN(16)
                     86: C M4_function returns, in eax, the number of 1 bits in {src,size} for popcount, or in {src,size} XOR {src2,size} for hamdist.
                     87: PROLOGUE(M4_function)
                     88: deflit(`FRAME',0)
                     89:
                     90:        movl    PARAM_SIZE, %ecx        C ecx = size
                     91:        movl    PARAM_SRC, %eax         C eax = src
                     92: C mm7, mm6, mm5 = the 0xAA.., 0x33.., 0x0F.. masks: built from immediates under PIC, loaded from rodata otherwise.
                     93: ifdef(`PIC',`
                     94:        movl    $0xAAAAAAAA, %edx
                     95:        movd    %edx, %mm7
                     96:        punpckldq %mm7, %mm7
                     97:
                     98:        movl    $0x33333333, %edx
                     99:        movd    %edx, %mm6
                    100:        punpckldq %mm6, %mm6
                    101:
                    102:        movl    $0x0F0F0F0F, %edx
                    103:        movd    %edx, %mm5
                    104:        punpckldq %mm5, %mm5
                    105:
                    106: HAM(`  movl    PARAM_SRC2, %edx')
                    107:
                    108: ',`
                    109:        dnl non-PIC
                    110: HAM(`  movl    PARAM_SRC2, %edx')
                    111:        movq    L(rodata_AAAAAAAAAAAAAAAA), %mm7
                    112:        movq    L(rodata_3333333333333333), %mm6
                    113:        movq    L(rodata_0F0F0F0F0F0F0F0F), %mm5
                    114: ')
                    115:
                    116:        pxor    %mm4, %mm4              C zero
                    117:        pxor    %mm0, %mm0              C total
                    118: C ecx = size-1.  With two or more limbs enter the paired loop at L(top); with exactly one limb fall into L(last).
                    119:        subl    $1, %ecx
                    120:        ja      L(top)
                    121: C NOTE(review): size==0 also falls through here with ecx=-1, so the load below reads src[-1]; presumably callers guarantee size >= 1 -- confirm.
                    122: L(last):
                    123:        movd    (%eax,%ecx,4), %mm1             C src high limb
                    124: HAM(`  movd    (%edx,%ecx,4), %mm2
                    125:        pxor    %mm2, %mm1
                    126: ')
                    127:        jmp     L(loaded)
                    128:
                    129:
                    130: L(top):
                    131:        C eax   src
                    132:        C ebx
                    133:        C ecx   counter, size-1 to 2 or 1, inclusive
                    134:        C edx   [hamdist] src2
                    135:        C
                    136:        C mm0   total (low dword)
                    137:        C mm1   (scratch)
                    138:        C mm2   (scratch)
                    139:        C mm3
                    140:        C mm4   0x0000000000000000
                    141:        C mm5   0x0F0F0F0F0F0F0F0F
                    142:        C mm6   0x3333333333333333
                    143:        C mm7   0xAAAAAAAAAAAAAAAA
                    144:
                    145:        movd    (%eax), %mm1
                    146:        movd    4(%eax), %mm2
                    147:        punpckldq %mm2, %mm1            C mm1 = two consecutive src limbs
                    148:        addl    $8, %eax
                    149:
                    150: HAM(`  movd    (%edx), %mm2
                    151:        movd    4(%edx), %mm3
                    152:        punpckldq %mm3, %mm2
                    153:        pxor    %mm2, %mm1
                    154:        addl    $8, %edx
                    155: ')
                    156: C mm1 = one or two limbs (XORed with src2 for hamdist); count the bits in 2, then 4, then 8 bit fields.
                    157: L(loaded):
                    158:        movq    %mm7, %mm2
                    159:        pand    %mm1, %mm2
                    160:        psrlq   $1, %mm2        C odd bits, moved down one place
                    161:        psubd   %mm2, %mm1      C bit pairs: x - ((x & 0xAA..) >> 1)
                    162:
                    163:        movq    %mm6, %mm2
                    164:        pand    %mm1, %mm2
                    165:        psrlq   $2, %mm1
                    166:        pand    %mm6, %mm1
                    167:        paddd   %mm2, %mm1      C nibbles: (x & 0x33..) + ((x >> 2) & 0x33..)
                    168:
                    169:        movq    %mm5, %mm2
                    170:        pand    %mm1, %mm2
                    171:        psrlq   $4, %mm1
                    172:        pand    %mm5, %mm1
                    173:        paddd   %mm2, %mm1      C bytes: (x & 0x0F..) + ((x >> 4) & 0x0F..)
                    174:
                    175:        psadbw( %mm4, %mm1)     C add up the eight byte counts (differences against zero)
                    176:        paddd   %mm1, %mm0      C to total
                    177:
                    178:        subl    $2, %ecx        C step the counter by a limb pair
                    179:        jg      L(top)          C two or more limbs remain
                    180:
                    181:        C after the loop ecx is 0 or -1 representing respectively 1 or 0 further
                    182:        jz      L(last)         C limbs; after L(last) it is -2 and this branch is not taken
                    183:
                    184:        C the total is in the low dword of mm0
                    185:        movd    %mm0, %eax
                    186:        emms
                    187:        ret
                    188:
                    189: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>