[BACK]Return to popham.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7 / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/popham.asm, Revision 1.1.1.2

1.1       maekawa     1: dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
                      2: dnl  distance.
                      3: dnl
                      4: dnl  K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
                      5:
                      6:
                      7: dnl  Copyright (C) 2000 Free Software Foundation, Inc.
                      8: dnl
                      9: dnl  This file is part of the GNU MP Library.
                     10: dnl
                     11: dnl  The GNU MP Library is free software; you can redistribute it and/or
                     12: dnl  modify it under the terms of the GNU Lesser General Public License as
                     13: dnl  published by the Free Software Foundation; either version 2.1 of the
                     14: dnl  License, or (at your option) any later version.
                     15: dnl
                     16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     19: dnl  Lesser General Public License for more details.
                     20: dnl
                     21: dnl  You should have received a copy of the GNU Lesser General Public
                     22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     24: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     25:
                     26:
                     27: include(`../config.m4')
                     28:
                     29:
                     30: dnl  Only recent versions of gas know psadbw, in particular gas 2.9.1 on
                     31: dnl  FreeBSD 3.3 and 3.4 doesn't recognise it.
                     32:
                     33: define(psadbw_mm4_mm0,
                     34: `ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon',
                     35:                          `HAVE_TARGET_CPU_pentium3'),1,
                     36:        `.byte 0x0f,0xf6,0xc4   C psadbw %mm4, %mm0',
                     37:
                     38: `m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only
                     39: ')     C this works enough for the sum of bytes done below, making it
                     40:        C possible to test on an older cpu
                     41:        leal    -8(%esp), %esp          C reserve 8 bytes of stack scratch
                     42:        movq    %mm4, (%esp)            C save the current mm4
                     43:        movq    %mm0, %mm4              C mm4 = working copy of mm0
                     44: forloop(i,1,7,
                     45: `      psrlq   $ 8, %mm4               C bring the next byte down to byte 0
                     46:        paddb   %mm4, %mm0              C byte 0 of mm0 accumulates the sum, mod 256
                     47: ')
                     48:        pushl   $ 0                     C high dword of the mask
                     49:        pushl   $ 0xFF                  C low dword, qword at (%esp) is now 0xFF
                     50:        pand    (%esp), %mm0            C keep only the byte 0 sum
                     51:        movq    8(%esp), %mm4           C restore the saved mm4
                     52:        leal    16(%esp), %esp          C release the mask and the save slot
                     53: ')')
                     54:
                     55:
                     56: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
                     57: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
                     58: C
                     59: C The code here is almost certainly not optimal, but is already a 3x speedup
                     60: C over the generic C code.  The main improvement would be to interleave
                     61: C processing of two qwords in the loop so as to fully exploit the available
                     62: C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
                     63: C
                     64: C The loop is based on the example "Efficient 64-bit population count using
                     65: C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
                     66: C page 158 of rev E (reference in mpn/x86/k7/README).
                     67: dnl  Report an m4 error unless OPERATION_popcount or OPERATION_hamdist is defined.
                     68: ifdef(`OPERATION_popcount',,
                     69: `ifdef(`OPERATION_hamdist',,
                     70: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
                     71: ')')')
                     72: dnl  Usage: HAM(text) -- expands to text only when OPERATION_hamdist is defined.
                     73: define(HAM,
                     74: m4_assert_numargs(1)
                     75: `ifdef(`OPERATION_hamdist',`$1')')
                     76: dnl  Usage: POP(text) -- expands to text only when OPERATION_popcount is defined.
                     77: define(POP,
                     78: m4_assert_numargs(1)
                     79: `ifdef(`OPERATION_popcount',`$1')')
                     80: dnl  Per-operation parameter offsets (presumably bytes from %esp at entry) and function name.
                     81: HAM(`
                     82: defframe(PARAM_SIZE,   12)
                     83: defframe(PARAM_SRC2,   8)
                     84: defframe(PARAM_SRC,    4)
                     85: define(M4_function,mpn_hamdist)
                     86: ')
                     87: POP(`
                     88: defframe(PARAM_SIZE,   8)
                     89: defframe(PARAM_SRC,    4)
                     90: define(M4_function,mpn_popcount)
                     91: ')
                     92: dnl  NOTE(review): presumably lists the functions this one source file can build -- confirm.
                     93: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
                     94:
                     95:
                     96: ifdef(`PIC',,`
                     97:        dnl  non-PIC
                     98:
1.1.1.2 ! maekawa    99:        DATA            C NOTE(review): labels below say rodata, but the section macro used is DATA
1.1       maekawa   100:        ALIGN(8)
                    101: dnl  LS(name) expands to LF(M4_function,name), presumably a label local to this function
                    102: define(LS,
                    103: m4_assert_numargs(1)
                    104: `LF(M4_function,`$1')')
                    105:
                    106: LS(rodata_AAAAAAAAAAAAAAAA):           C 64-bit mask, high bit of each bit pair
                    107:        .long   0xAAAAAAAA
                    108:        .long   0xAAAAAAAA
                    109:
                    110: LS(rodata_3333333333333333):           C 64-bit mask, low two bits of each nibble
                    111:        .long   0x33333333
                    112:        .long   0x33333333
                    113:
                    114: LS(rodata_0F0F0F0F0F0F0F0F):           C 64-bit mask, low nibble of each byte
                    115:        .long   0x0F0F0F0F
                    116:        .long   0x0F0F0F0F
                    117: ')
                    118:
                    119:        .text
                    120:        ALIGN(32)
                    121:
                    122: PROLOGUE(M4_function)
                    123: deflit(`FRAME',0)
                    124:
                    125:        movl    PARAM_SIZE, %ecx        C ecx = size in limbs
                    126:        orl     %ecx, %ecx              C set ZF when size is zero
                    127:        jz      L(zero)                 C nothing to count, return 0
                    128:
                    129: ifdef(`PIC',`
                    130:        movl    $0xAAAAAAAA, %eax       C PIC: build the masks from immediates, no data reference
                    131:        movl    $0x33333333, %edx
                    132:
                    133:        movd    %eax, %mm7              C low dword of mm7 = 0xAAAAAAAA
                    134:        movd    %edx, %mm6              C low dword of mm6 = 0x33333333
                    135:
                    136:        movl    $0x0F0F0F0F, %eax
                    137:
                    138:        punpckldq %mm7, %mm7            C copy the low dword into the high dword
                    139:        punpckldq %mm6, %mm6            C likewise
                    140:
                    141:        movd    %eax, %mm5              C low dword of mm5 = 0x0F0F0F0F
                    142:        movd    %edx, %mm4              C NOTE(review): looks redundant, mm4 is cleared by the pxor after this ifdef
                    143:
                    144:        punpckldq %mm5, %mm5            C copy the low dword into the high dword
                    145:
                    146: ',`
                    147:        movq    LS(rodata_AAAAAAAAAAAAAAAA), %mm7       C non-PIC: load the masks from memory
                    148:        movq    LS(rodata_3333333333333333), %mm6
                    149:        movq    LS(rodata_0F0F0F0F0F0F0F0F), %mm5
                    150: ')
                    151:        pxor    %mm4, %mm4              C mm4 = 0, second operand for psadbw
                    152:
                    153: define(REG_AAAAAAAAAAAAAAAA,%mm7)
                    154: define(REG_3333333333333333,%mm6)
                    155: define(REG_0F0F0F0F0F0F0F0F,%mm5)
                    156: define(REG_0000000000000000,%mm4)
                    157:
                    158:
                    159:        movl    PARAM_SRC, %eax         C eax = src
                    160: HAM(`  movl    PARAM_SRC2, %edx')      C edx = src2, hamdist only
                    161:
                    162:        pxor    %mm2, %mm2      C total
                    163:
                    164:        shrl    %ecx                    C ecx = size/2 qwords, carry = low bit of size
                    165:        jnc     L(top)                  C even size, go straight to the qword loop
                    166:
                    167:        movd    (%eax,%ecx,8), %mm1     C odd size: last limb into the low dword, movd clears the high dword
                    168:
                    169: HAM(`  movd    0(%edx,%ecx,8), %mm0    C matching limb of src2
                    170:        pxor    %mm0, %mm1              C set bits now mark positions that differ
                    171: ')
                    172:        orl     %ecx, %ecx              C set ZF for the jnz at the loop end
                    173:        jmp     L(loaded)               C enter the loop body past the qword load
                    174:
                    175:
                    176:        ALIGN(16)
                    177: L(top):
                    178:        C eax   src
                    179:        C ebx
                    180:        C ecx   counter, qwords, decrementing
                    181:        C edx   [hamdist] src2
                    182:        C
                    183:        C mm0   (scratch)
                    184:        C mm1   (scratch)
                    185:        C mm2   total (low dword)
                    186:        C mm3
                    187:        C mm4   \
                    188:        C mm5   | special constants
                    189:        C mm6   |
                    190:        C mm7   /
                    191:
                    192:        movq    -8(%eax,%ecx,8), %mm1   C next qword, working from the high end down
                    193:
                    194: HAM(`  pxor    -8(%edx,%ecx,8), %mm1') C hamdist: set bits mark positions that differ
                    195:        decl    %ecx                    C ZF from here is consumed by the jnz at the loop end
                    196:
                    197: L(loaded):
                    198:        movq    %mm1, %mm0              C mm0 = copy of the input bits
                    199:        pand    REG_AAAAAAAAAAAAAAAA, %mm1      C high bit of each bit pair
                    200:
                    201:        psrlq   $1, %mm1                C moved down to the low position of the pair
                    202:
                    203:        psubd   %mm1, %mm0      C bit pairs
                    204:
                    205:
                    206:        movq    %mm0, %mm1
                    207:        psrlq   $2, %mm0                C high pair of each nibble moved down
                    208:
                    209:        pand    REG_3333333333333333, %mm0      C isolate the pair counts
                    210:        pand    REG_3333333333333333, %mm1
                    211:
                    212:        paddd   %mm1, %mm0      C nibbles
                    213:
                    214:
                    215:        movq    %mm0, %mm1
                    216:        psrlq   $4, %mm0                C high nibble of each byte moved down
                    217:
                    218:        pand    REG_0F0F0F0F0F0F0F0F, %mm0      C isolate the nibble counts
                    219:        pand    REG_0F0F0F0F0F0F0F0F, %mm1
                    220:
                    221:        paddd   %mm1, %mm0      C bytes
                    222:
                    223:        C sum the eight byte counts of mm0 against the zero in mm4
                    224:        psadbw_mm4_mm0
                    225:
                    226:        paddd   %mm0, %mm2      C add to total
                    227:        jnz     L(top)                  C loop while ecx != 0, ZF is from the decl or orl above
                    228:
                    229:
                    230:        movd    %mm2, %eax              C return the low dword of the total
                    231:        emms                            C clear MMX state before returning
                    232:        ret
                    233:
                    234:
                    235: L(zero):
                    236:        movl    $0, %eax                C size was zero, result is 0
                    237:        ret
                    238:
                    239: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>