[BACK]Return to popham.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / k7 / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/popham.asm, Revision 1.1

1.1     ! maekawa     1: dnl  AMD K7 mpn_popcount, mpn_hamdist -- population count and hamming
        !             2: dnl  distance.
        !             3: dnl
        !             4: dnl  K7: popcount 5.0 cycles/limb, hamdist 6.0 cycles/limb
        !             5:
        !             6:
        !             7: dnl  Copyright (C) 2000 Free Software Foundation, Inc.
        !             8: dnl
        !             9: dnl  This file is part of the GNU MP Library.
        !            10: dnl
        !            11: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !            12: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            13: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            14: dnl  License, or (at your option) any later version.
        !            15: dnl
        !            16: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            17: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            18: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            19: dnl  Lesser General Public License for more details.
        !            20: dnl
        !            21: dnl  You should have received a copy of the GNU Lesser General Public
        !            22: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            23: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            24: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            25:
        !            26:
        !            27: include(`../config.m4')
        !            28:
        !            29:
        !            30: dnl  psadbw_mm4_mm0 emits "psadbw %mm4, %mm0".  Only recent versions of
        !            31: dnl  gas know psadbw; gas 2.9.1 on FreeBSD 3.3 and 3.4 does not recognise it,
        !            32: dnl  so it is given as raw bytes, or simulated when no suitable CPU is targeted.
        !            33: define(psadbw_mm4_mm0,
        !            34: `ifelse(m4_ifdef_anyof_p(`HAVE_TARGET_CPU_athlon',
        !            35:                          `HAVE_TARGET_CPU_pentium3'),1,
        !            36:        `.byte 0x0f,0xf6,0xc4   C psadbw %mm4, %mm0',
        !            37:
        !            38: `m4_warning(`warning, using simulated and only partly functional psadbw, use for testing only
        !            39: ')     C this works enough for the sum of bytes done below, making it
        !            40:        C possible to test on an older cpu
        !            41:        leal    -8(%esp), %esp  C make room with lea so the flags stay intact for the jnz of the caller
        !            42:        movq    %mm4, (%esp)    C save mm4 which is borrowed as scratch
        !            43:        movq    %mm0, %mm4
        !            44: forloop(i,1,7,
        !            45: `      psrlq   $ 8, %mm4
        !            46:        paddb   %mm4, %mm0      C low byte of mm0 accumulates the next byte up
        !            47: ')
        !            48:        pushl   $ 0             C high dword of the 64-bit mask
        !            49:        pushl   $ 0xFF          C low dword of the mask keeps only the low byte
        !            50:        pand    (%esp), %mm0    C mm0 = sum of the eight original bytes
        !            51:        movq    8(%esp), %mm4   C restore the saved mm4
        !            52:        leal    16(%esp), %esp  C drop the mask and the save slot without touching flags
        !            53: ')')
        !            54:
        !            55:
        !            56: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
        !            57: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
        !            58: C
        !            59: C The code here is almost certainly not optimal, but is already a 3x speedup
        !            60: C over the generic C code.  The main improvement would be to interleave
        !            61: C processing of two qwords in the loop so as to fully exploit the available
        !            62: C execution units, possibly leading to 3.25 c/l (13 cycles for 4 limbs).
        !            63: C
        !            64: C The loop is based on the example "Efficient 64-bit population count using
        !            65: C MMX instructions" in the Athlon Optimization Guide, AMD document 22007,
        !            66: C page 158 of rev E (reference in mpn/x86/k7/README).
        !            67: dnl  At least one of OPERATION_popcount and OPERATION_hamdist must be defined, otherwise m4_error is raised.
        !            68: ifdef(`OPERATION_popcount',,
        !            69: `ifdef(`OPERATION_hamdist',,
        !            70: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
        !            71: ')')')
        !            72: dnl  HAM(code) expands to code when OPERATION_hamdist is defined and to nothing otherwise.
        !            73: define(HAM,
        !            74: m4_assert_numargs(1)
        !            75: `ifdef(`OPERATION_hamdist',`$1')')
        !            76: dnl  POP(code) expands to code when OPERATION_popcount is defined and to nothing otherwise.
        !            77: define(POP,
        !            78: m4_assert_numargs(1)
        !            79: `ifdef(`OPERATION_popcount',`$1')')
        !            80: dnl  Parameter slots follow the argument order of the C prototypes -- presumably byte offsets on the stack, confirm against defframe.
        !            81: HAM(`
        !            82: defframe(PARAM_SIZE,   12)
        !            83: defframe(PARAM_SRC2,   8)
        !            84: defframe(PARAM_SRC,    4)
        !            85: define(M4_function,mpn_hamdist)
        !            86: ')
        !            87: POP(`
        !            88: defframe(PARAM_SIZE,   8)
        !            89: defframe(PARAM_SRC,    4)
        !            90: define(M4_function,mpn_popcount)
        !            91: ')
        !            92: dnl  MULFUNC_PROLOGUE is defined elsewhere -- presumably it lists the entry points this file can be built as, TODO confirm.
        !            93: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
        !            94:
        !            95:
        !            96: ifdef(`PIC',,`
        !            97:        dnl  non-PIC: the three 64-bit masks are kept in rodata and loaded with movq in the function setup
        !            98:
        !            99:        .section .rodata
        !           100:        ALIGN(8)
        !           101:        dnl  LS(name) builds a label from M4_function and name via LF -- LF is defined elsewhere, presumably a function-local label
        !           102: define(LS,
        !           103: m4_assert_numargs(1)
        !           104: `LF(M4_function,`$1')')
        !           105:        dnl  mask of the upper bit of every bit pair
        !           106: LS(rodata_AAAAAAAAAAAAAAAA):
        !           107:        .long   0xAAAAAAAA
        !           108:        .long   0xAAAAAAAA
        !           109:        dnl  mask of the low two bits of every nibble
        !           110: LS(rodata_3333333333333333):
        !           111:        .long   0x33333333
        !           112:        .long   0x33333333
        !           113:        dnl  mask of the low nibble of every byte
        !           114: LS(rodata_0F0F0F0F0F0F0F0F):
        !           115:        .long   0x0F0F0F0F
        !           116:        .long   0x0F0F0F0F
        !           117: ')
        !           118:
        !           119:        .text
        !           120:        ALIGN(32)
        !           121:        C Returns the bit count in eax.  A size of zero returns 0 before any MMX register is touched.
        !           122: PROLOGUE(M4_function)
        !           123: deflit(`FRAME',0)
        !           124:
        !           125:        movl    PARAM_SIZE, %ecx        C ecx = size in limbs
        !           126:        orl     %ecx, %ecx              C set ZF from size
        !           127:        jz      L(zero)
        !           128:        C Set up the three 64-bit masks in mm7 mm6 mm5 and a zero in mm4.
        !           129: ifdef(`PIC',`
        !           130:        movl    $0xAAAAAAAA, %eax
        !           131:        movl    $0x33333333, %edx
        !           132:        C PIC build: the masks are made from immediates instead of rodata loads
        !           133:        movd    %eax, %mm7
        !           134:        movd    %edx, %mm6
        !           135:
        !           136:        movl    $0x0F0F0F0F, %eax
        !           137:
        !           138:        punpckldq %mm7, %mm7            C copy the low dword into the high dword
        !           139:        punpckldq %mm6, %mm6
        !           140:
        !           141:        movd    %eax, %mm5
        !           142:        C mm4 is not loaded here because the pxor after this block clears it
        !           143:
        !           144:        punpckldq %mm5, %mm5
        !           145:
        !           146: ',`
        !           147:        movq    LS(rodata_AAAAAAAAAAAAAAAA), %mm7
        !           148:        movq    LS(rodata_3333333333333333), %mm6
        !           149:        movq    LS(rodata_0F0F0F0F0F0F0F0F), %mm5
        !           150: ')
        !           151:        pxor    %mm4, %mm4              C mm4 = 0 is the second operand of psadbw
        !           152:
        !           153: define(REG_AAAAAAAAAAAAAAAA,%mm7)
        !           154: define(REG_3333333333333333,%mm6)
        !           155: define(REG_0F0F0F0F0F0F0F0F,%mm5)
        !           156: define(REG_0000000000000000,%mm4)
        !           157:
        !           158:
        !           159:        movl    PARAM_SRC, %eax
        !           160: HAM(`  movl    PARAM_SRC2, %edx')
        !           161:
        !           162:        pxor    %mm2, %mm2      C total
        !           163:
        !           164:        shrl    %ecx            C ecx = number of qwords and CF is set when one limb is left over
        !           165:        jnc     L(top)
        !           166:        C Odd size: the highest limb is handled alone through a 32-bit load into mm1.
        !           167:        movd    (%eax,%ecx,8), %mm1
        !           168:
        !           169: HAM(`  movd    0(%edx,%ecx,8), %mm0
        !           170:        pxor    %mm0, %mm1
        !           171: ')
        !           172:        orl     %ecx, %ecx      C ZF for the jnz at the loop end as nothing in between writes flags
        !           173:        jmp     L(loaded)
        !           174:
        !           175:
        !           176:        ALIGN(16)
        !           177: L(top):
        !           178:        C eax   src
        !           179:        C ebx
        !           180:        C ecx   counter, qwords, decrementing
        !           181:        C edx   [hamdist] src2
        !           182:        C
        !           183:        C mm0   (scratch)
        !           184:        C mm1   (scratch)
        !           185:        C mm2   total (low dword)
        !           186:        C mm3
        !           187:        C mm4   \
        !           188:        C mm5   | special constants
        !           189:        C mm6   |
        !           190:        C mm7   /
        !           191:
        !           192:        movq    -8(%eax,%ecx,8), %mm1
        !           193:
        !           194: HAM(`  pxor    -8(%edx,%ecx,8), %mm1')
        !           195:        decl    %ecx            C ZF from here is consumed by the jnz below
        !           196:
        !           197: L(loaded):
        !           198:        movq    %mm1, %mm0
        !           199:        pand    REG_AAAAAAAAAAAAAAAA, %mm1
        !           200:
        !           201:        psrlq   $1, %mm1
        !           202:
        !           203:        psubd   %mm1, %mm0      C bit pairs
        !           204:
        !           205:
        !           206:        movq    %mm0, %mm1
        !           207:        psrlq   $2, %mm0
        !           208:
        !           209:        pand    REG_3333333333333333, %mm0
        !           210:        pand    REG_3333333333333333, %mm1
        !           211:
        !           212:        paddd   %mm1, %mm0      C nibbles
        !           213:
        !           214:
        !           215:        movq    %mm0, %mm1
        !           216:        psrlq   $4, %mm0
        !           217:
        !           218:        pand    REG_0F0F0F0F0F0F0F0F, %mm0
        !           219:        pand    REG_0F0F0F0F0F0F0F0F, %mm1
        !           220:
        !           221:        paddd   %mm1, %mm0      C bytes
        !           222:
        !           223:        C Sum the eight per-byte counts of mm0 against the zero in mm4.
        !           224:        psadbw_mm4_mm0
        !           225:
        !           226:        paddd   %mm0, %mm2      C add to total
        !           227:        jnz     L(top)
        !           228:
        !           229:
        !           230:        movd    %mm2, %eax      C return the low dword of the total
        !           231:        emms
        !           232:        ret
        !           233:
        !           234:
        !           235: L(zero):
        !           236:        xorl    %eax, %eax      C size 0 gives a count of 0 and no MMX state was touched so no emms
        !           237:        ret
        !           238:
        !           239: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>