[BACK]Return to popham.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7 / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/popham.asm, Revision 1.1.1.3

1.1       maekawa     1: dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
                      2: dnl  distance.
                      3:
1.1.1.3 ! ohara       4: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
1.1       maekawa     5: dnl
                      6: dnl  This file is part of the GNU MP Library.
                      7: dnl
                      8: dnl  The GNU MP Library is free software; you can redistribute it and/or
                      9: dnl  modify it under the terms of the GNU Lesser General Public License as
                     10: dnl  published by the Free Software Foundation; either version 2.1 of the
                     11: dnl  License, or (at your option) any later version.
                     12: dnl
                     13: dnl  The GNU MP Library is distributed in the hope that it will be useful,
                     14: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
                     15: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
                     16: dnl  Lesser General Public License for more details.
                     17: dnl
                     18: dnl  You should have received a copy of the GNU Lesser General Public
                     19: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
                     20: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
                     21: dnl  Suite 330, Boston, MA 02111-1307, USA.
                     22:
                     23: include(`../config.m4')
                     24:
                     25:
1.1.1.3 ! ohara      26: C K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
1.1       maekawa    27:
                     28:
                     29: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
                     30: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
                     31: C
                     32: C The code here is almost certainly not optimal, but is already a 3x speedup
                     33: C over the generic C code.  The main improvement would be to interleave
                     34: C processing of two qwords in the loop so as to fully exploit the available
                     35: C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
                     36: C
                     37: C The loop is based on the example "Efficient 64-bit population count using
                     38: C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
                     39: C page 158 of rev E (reference in mpn/x86/k7/README).
                     40: C Build-time check: if neither OPERATION_popcount nor OPERATION_hamdist is defined, m4_error is invoked.
                     41: ifdef(`OPERATION_popcount',,
                     42: `ifdef(`OPERATION_hamdist',,
                     43: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
                     44: ')')')
                     45: C HAM(text) expands to its single argument only when OPERATION_hamdist is defined, otherwise to nothing.
                     46: define(HAM,
                     47: m4_assert_numargs(1)
                     48: `ifdef(`OPERATION_hamdist',`$1')')
                     49: C POP(text) expands to its single argument only when OPERATION_popcount is defined, otherwise to nothing.
                     50: define(POP,
                     51: m4_assert_numargs(1)
                     52: `ifdef(`OPERATION_popcount',`$1')')
                     53: C Per-operation argument slots and entry point name.  The numbers given to defframe are presumably byte offsets from the stack pointer at entry (defframe is defined elsewhere -- confirm).
                     54: HAM(`
                     55: defframe(PARAM_SIZE,   12)
                     56: defframe(PARAM_SRC2,   8)
                     57: defframe(PARAM_SRC,    4)
                     58: define(M4_function,mpn_hamdist)
                     59: ')
                     60: POP(`
                     61: defframe(PARAM_SIZE,   8)
                     62: defframe(PARAM_SRC,    4)
                     63: define(M4_function,mpn_popcount)
                     64: ')
                     65: C Lists both function names this one source file can be built as (MULFUNC_PROLOGUE is defined elsewhere; presumably read by the build system -- confirm).
                     66: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
                     67:
                     68: C Non-PIC builds only: the three 64-bit masks used by the bit counting loop are kept in read-only data and loaded with movq; PIC builds construct them in registers instead.
                     69: ifdef(`PIC',,`
                     70:        dnl  non-PIC
                     71:
1.1.1.3 ! ohara      72:        RODATA
1.1       maekawa    73:        ALIGN(8)
                     74:        C Mask with the upper bit of every bit pair set.
1.1.1.3 ! ohara      75: L(rodata_AAAAAAAAAAAAAAAA):
1.1       maekawa    76:        .long   0xAAAAAAAA
                     77:        .long   0xAAAAAAAA
                     78:        C Mask with the low two bits of every nibble set.
1.1.1.3 ! ohara      79: L(rodata_3333333333333333):
1.1       maekawa    80:        .long   0x33333333
                     81:        .long   0x33333333
                     82:        C Mask with the low nibble of every byte set.
1.1.1.3 ! ohara      83: L(rodata_0F0F0F0F0F0F0F0F):
1.1       maekawa    84:        .long   0x0F0F0F0F
                     85:        .long   0x0F0F0F0F
                     86: ')
                     87: C Function body, emitted under the name M4_function (mpn_popcount or mpn_hamdist).  Counts set bits of src, or of src xor src2 for hamdist, one qword per loop pass.
1.1.1.3 ! ohara      88:        TEXT
1.1       maekawa    89:        ALIGN(32)
                     90: C The result is returned in eax, taken from the low dword of the running total in mm2; emms is executed before ret.
                     91: PROLOGUE(M4_function)
                     92: deflit(`FRAME',0)
                     93: C NOTE(review): size is assumed to be at least 1.  With size 0 the code would jump to L(top), read the qword below src and keep looping -- confirm callers guarantee this.
                     94:        movl    PARAM_SIZE, %ecx        C ecx = size in limbs
                     95: C PIC builds form the masks from immediates so no data address is needed; non-PIC builds load them from the rodata labels.
                     96: ifdef(`PIC',`
                     97:        movl    $0xAAAAAAAA, %eax
                     98:        movl    $0x33333333, %edx
                     99:
                    100:        movd    %eax, %mm7
                    101:        movd    %edx, %mm6
                    102:
                    103:        movl    $0x0F0F0F0F, %eax
                    104:        C punpckldq with itself copies the low dword into the high dword, giving the full 64-bit mask.
                    105:        punpckldq %mm7, %mm7
                    106:        punpckldq %mm6, %mm6
                    107:
                    108:        movd    %eax, %mm5
                    109:        movd    %edx, %mm4      C NOTE(review): mm4 is cleared by the pxor after this block before any read, so this write looks redundant -- confirm before removing
                    110:
                    111:        punpckldq %mm5, %mm5
                    112:
                    113: ',`
1.1.1.3 ! ohara     114:        movq    L(rodata_AAAAAAAAAAAAAAAA), %mm7
        !           115:        movq    L(rodata_3333333333333333), %mm6
        !           116:        movq    L(rodata_0F0F0F0F0F0F0F0F), %mm5
1.1       maekawa   117: ')
                    118:        pxor    %mm4, %mm4      C mm4 = 0, the second operand for psadbw in the loop
                    119: C Symbolic names for the registers holding the constants.
                    120: define(REG_AAAAAAAAAAAAAAAA,%mm7)
                    121: define(REG_3333333333333333,%mm6)
                    122: define(REG_0F0F0F0F0F0F0F0F,%mm5)
                    123: define(REG_0000000000000000,%mm4)      C not referenced in the code shown; the psadbw line names mm4 directly
                    124:
                    125:
                    126:        movl    PARAM_SRC, %eax         C eax = src
                    127: HAM(`  movl    PARAM_SRC2, %edx')      C edx = src2, hamdist only
                    128:
                    129:        pxor    %mm2, %mm2      C total
                    130: C Halve the limb count to get the qword count.  Carry set means size was odd: the top limb is then processed first on its own.
                    131:        shrl    %ecx
                    132:        jnc     L(top)
                    133: C Odd size: movd loads the single limb at index 2*ecx, zero extended to 64 bits.
                    134:        movd    (%eax,%ecx,8), %mm1
                    135:
1.1.1.3 ! ohara     136: HAM(`  movd    (%edx,%ecx,8), %mm0
1.1       maekawa   137:        pxor    %mm0, %mm1
                    138: ')
                    139:        orl     %ecx, %ecx      C set ZF from the remaining qword count, for the jnz at the end of the loop body
                    140:        jmp     L(loaded)
                    141:
                    142:
                    143:        ALIGN(16)
                    144: L(top):
                    145:        C eax   src
                    146:        C ebx
                    147:        C ecx   counter, qwords, decrementing
                    148:        C edx   [hamdist] src2
                    149:        C
                    150:        C mm0   (scratch)
                    151:        C mm1   (scratch)
                    152:        C mm2   total (low dword)
                    153:        C mm3
                    154:        C mm4   \
                    155:        C mm5   | special constants
                    156:        C mm6   |
                    157:        C mm7   /
                    158: C Fetch qword number ecx-1, so the data is walked from the top down; hamdist xors in the matching src2 qword.
                    159:        movq    -8(%eax,%ecx,8), %mm1
                    160:
                    161: HAM(`  pxor    -8(%edx,%ecx,8), %mm1')
                    162:        decl    %ecx            C ZF from here is tested by the jnz below; the MMX instructions in between do not change flags
                    163:
                    164: L(loaded):
                    165:        movq    %mm1, %mm0
                    166:        pand    REG_AAAAAAAAAAAAAAAA, %mm1
                    167:
                    168:        psrlq   $1, %mm1
                    169:
                    170:        psubd   %mm1, %mm0      C bit pairs: x minus the upper bit of each pair shifted down leaves a 2-bit count per pair
                    171:
                    172: C Add neighbouring 2-bit counts to give a count in each nibble.
                    173:        movq    %mm0, %mm1
                    174:        psrlq   $2, %mm0
                    175:
                    176:        pand    REG_3333333333333333, %mm0
                    177:        pand    REG_3333333333333333, %mm1
                    178:
                    179:        paddd   %mm1, %mm0      C nibbles
                    180:
                    181: C Add neighbouring nibble counts to give a count in each byte.
                    182:        movq    %mm0, %mm1
                    183:        psrlq   $4, %mm0
                    184:
                    185:        pand    REG_0F0F0F0F0F0F0F0F, %mm0
                    186:        pand    REG_0F0F0F0F0F0F0F0F, %mm1
                    187:
                    188:        paddd   %mm1, %mm0      C bytes
                    189:
                    190: C Sum of absolute differences against zero (mm4) adds the eight byte counts into the low word of mm0.
1.1.1.3 ! ohara     191:        psadbw( %mm4, %mm0)
1.1       maekawa   192: C NOTE(review): psadbw is written in macro call form; presumably a macro defined elsewhere emits the instruction -- confirm.
                    193:        paddd   %mm0, %mm2      C add to total
                    194:        jnz     L(top)          C loop while qwords remain; ZF comes from the decl, or from the orl on the odd-limb entry
                    195:
                    196:
                    197:        movd    %mm2, %eax      C return value = low dword of the total
                    198:        emms                    C clear the MMX state before returning
                    199:        ret
                    200:
                    201: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>