[BACK]Return to popham.asm CVS log [TXT][DIR] Up to [local] / OpenXM_contrib / gmp / mpn / x86 / pentium4 / mmx

Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/mmx/popham.asm, Revision 1.1

1.1     ! ohara       1: dnl  Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
        !             2: dnl  hamming distance.
        !             3:
        !             4: dnl  Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
        !             5: dnl
        !             6: dnl  This file is part of the GNU MP Library.
        !             7: dnl
        !             8: dnl  The GNU MP Library is free software; you can redistribute it and/or
        !             9: dnl  modify it under the terms of the GNU Lesser General Public License as
        !            10: dnl  published by the Free Software Foundation; either version 2.1 of the
        !            11: dnl  License, or (at your option) any later version.
        !            12: dnl
        !            13: dnl  The GNU MP Library is distributed in the hope that it will be useful,
        !            14: dnl  but WITHOUT ANY WARRANTY; without even the implied warranty of
        !            15: dnl  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
        !            16: dnl  Lesser General Public License for more details.
        !            17: dnl
        !            18: dnl  You should have received a copy of the GNU Lesser General Public
        !            19: dnl  License along with the GNU MP Library; see the file COPYING.LIB.  If
        !            20: dnl  not, write to the Free Software Foundation, Inc., 59 Temple Place -
        !            21: dnl  Suite 330, Boston, MA 02111-1307, USA.
        !            22:
        !            23: include(`../config.m4')
        !            24:
        !            25:
        !            26: C P4: popcount 8.5 cycles/limb
        !            27: C     hamdist  9.5 cycles/limb
        !            28:
        !            29:
        !            30: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
        !            31: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
        !            32: C
        !            33: C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
        !            34: C Two movd's and a punpckldq seem to be the same speed as an aligned movq,
        !            35: C and using them saves fiddling about with alignment testing on entry.
        !            36: C
        !            37: C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
        !            38: C might be possible, but 8.5 c/l relying on out-of-order execution is
        !            39: C already quite reasonable.
        !            40:
        !            41: ifdef(`OPERATION_popcount',,
        !            42: `ifdef(`OPERATION_hamdist',,
        !            43: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
        !            44: ')')')
        !            45: C HAM(x) expands to x when OPERATION_hamdist is defined, otherwise to nothing.
        !            46: define(HAM,
        !            47: m4_assert_numargs(1)
        !            48: `ifdef(`OPERATION_hamdist',`$1')')
        !            49: C POP(x) expands to x when OPERATION_popcount is defined, otherwise to nothing.
        !            50: define(POP,
        !            51: m4_assert_numargs(1)
        !            52: `ifdef(`OPERATION_popcount',`$1')')
        !            53: C Per-operation parameter offsets (defframe) and the M4_function name given to PROLOGUE.
        !            54: HAM(`
        !            55: defframe(PARAM_SIZE, 12)
        !            56: defframe(PARAM_SRC2,  8)
        !            57: defframe(PARAM_SRC,   4)
        !            58: define(M4_function,mpn_hamdist)
        !            59: ')
        !            60: POP(`
        !            61: defframe(PARAM_SIZE,  8)
        !            62: defframe(PARAM_SRC,   4)
        !            63: define(M4_function,mpn_popcount)
        !            64: ')
        !            65:
        !            66: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
        !            67:
        !            68:
        !            69: ifdef(`PIC',,`
        !            70:        dnl  non-PIC: the three 64-bit masks live in RODATA and are fetched with movq on entry
        !            71:        RODATA
        !            72:        ALIGN(8)
        !            73: L(rodata_AAAAAAAAAAAAAAAA):
        !            74:        .long   0xAAAAAAAA
        !            75:        .long   0xAAAAAAAA
        !            76: L(rodata_3333333333333333):
        !            77:        .long   0x33333333
        !            78:        .long   0x33333333
        !            79: L(rodata_0F0F0F0F0F0F0F0F):
        !            80:        .long   0x0F0F0F0F
        !            81:        .long   0x0F0F0F0F
        !            82: ')
        !            83:
        !            84:        TEXT
        !            85:        ALIGN(16)
        !            86: C Entry point, named mpn_popcount or mpn_hamdist via M4_function.  Result is returned in eax.
        !            87: PROLOGUE(M4_function)
        !            88: deflit(`FRAME',0)
        !            89:
        !            90:        movl    PARAM_SIZE, %ecx        C ecx = size in limbs
        !            91:        movl    PARAM_SRC, %eax         C eax = src
        !            92: C Masks: mm7, mm6, mm5 are built from immediates under PIC, else loaded from rodata.
        !            93: ifdef(`PIC',`
        !            94:        movl    $0xAAAAAAAA, %edx
        !            95:        movd    %edx, %mm7
        !            96:        punpckldq %mm7, %mm7
        !            97:
        !            98:        movl    $0x33333333, %edx
        !            99:        movd    %edx, %mm6
        !           100:        punpckldq %mm6, %mm6
        !           101:
        !           102:        movl    $0x0F0F0F0F, %edx
        !           103:        movd    %edx, %mm5
        !           104:        punpckldq %mm5, %mm5
        !           105:
        !           106: HAM(`  movl    PARAM_SRC2, %edx')
        !           107:
        !           108: ',`
        !           109:        dnl non-PIC: load the masks from the rodata constants
        !           110: HAM(`  movl    PARAM_SRC2, %edx')
        !           111:        movq    L(rodata_AAAAAAAAAAAAAAAA), %mm7
        !           112:        movq    L(rodata_3333333333333333), %mm6
        !           113:        movq    L(rodata_0F0F0F0F0F0F0F0F), %mm5
        !           114: ')
        !           115:
        !           116:        pxor    %mm4, %mm4              C zero
        !           117:        pxor    %mm0, %mm0              C total
        !           118: C NOTE(review): size 0 is not handled, it would reach L(last) with ecx = -1.  Presumably callers pass size >= 1.
        !           119:        subl    $1, %ecx                C ecx = size-1
        !           120:        ja      L(top)                  C size >= 2: enter the two-limb loop
        !           121: C One limb left.  ecx is 0 here for any size >= 1, so the load below reads the limb at eax.
        !           122: L(last):
        !           123:        movd    (%eax,%ecx,4), %mm1             C src high limb
        !           124: HAM(`  movd    (%edx,%ecx,4), %mm2
        !           125:        pxor    %mm2, %mm1
        !           126: ')
        !           127:        jmp     L(loaded)               C join the common counting code
        !           128:
        !           129:
        !           130: L(top):
        !           131:        C eax   src
        !           132:        C ebx
        !           133:        C ecx   counter, size-1 to 2 or 1, inclusive
        !           134:        C edx   [hamdist] src2
        !           135:        C
        !           136:        C mm0   total (low dword)
        !           137:        C mm1   (scratch)
        !           138:        C mm2   (scratch)
        !           139:        C mm3   [hamdist] (scratch)
        !           140:        C mm4   0x0000000000000000
        !           141:        C mm5   0x0F0F0F0F0F0F0F0F
        !           142:        C mm6   0x3333333333333333
        !           143:        C mm7   0xAAAAAAAAAAAAAAAA
        !           144: C Combine two limbs into one qword in mm1, low limb in the low dword.
        !           145:        movd    (%eax), %mm1
        !           146:        movd    4(%eax), %mm2
        !           147:        punpckldq %mm2, %mm1
        !           148:        addl    $8, %eax                C src += 2 limbs
        !           149:
        !           150: HAM(`  movd    (%edx), %mm2
        !           151:        movd    4(%edx), %mm3
        !           152:        punpckldq %mm3, %mm2
        !           153:        pxor    %mm2, %mm1
        !           154:        addl    $8, %edx
        !           155: ')
        !           156: C mm1 holds the bits to count: src data, or src xor src2 for hamdist.
        !           157: L(loaded):
        !           158:        movq    %mm7, %mm2
        !           159:        pand    %mm1, %mm2      C upper bit of each bit pair
        !           160:        psrlq   $1, %mm2
        !           161:        psubd   %mm2, %mm1      C bit pairs, each field = count of its 2 bits
        !           162:
        !           163:        movq    %mm6, %mm2
        !           164:        pand    %mm1, %mm2
        !           165:        psrlq   $2, %mm1
        !           166:        pand    %mm6, %mm1
        !           167:        paddd   %mm2, %mm1      C nibbles, each field = count of its 4 bits
        !           168:
        !           169:        movq    %mm5, %mm2
        !           170:        pand    %mm1, %mm2
        !           171:        psrlq   $4, %mm1
        !           172:        pand    %mm5, %mm1
        !           173:        paddd   %mm2, %mm1      C bytes, each byte = count of its 8 bits
        !           174:
        !           175:        psadbw( %mm4, %mm1)     C add up the 8 byte counts
        !           176:        paddd   %mm1, %mm0      C to total
        !           177:
        !           178:        subl    $2, %ecx
        !           179:        jg      L(top)          C ecx >= 1 means at least two limbs remain
        !           180:
        !           181:        C ecx is 0 or -1 representing respectively 1 or 0 further limbs
        !           182:        jz      L(last)
        !           183: C Done: ecx is -1 after the loop, or -2 after the L(last) pass.
        !           184:
        !           185:        movd    %mm0, %eax      C return value = low dword of total
        !           186:        emms                    C clear MMX state before returning
        !           187:        ret
        !           188:
        !           189: EPILOGUE()

FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>