Annotation of OpenXM_contrib/gmp/mpn/x86/pentium4/mmx/popham.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
! 2: dnl hamming distance.
! 3:
! 4: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 5: dnl
! 6: dnl This file is part of the GNU MP Library.
! 7: dnl
! 8: dnl The GNU MP Library is free software; you can redistribute it and/or
! 9: dnl modify it under the terms of the GNU Lesser General Public License as
! 10: dnl published by the Free Software Foundation; either version 2.1 of the
! 11: dnl License, or (at your option) any later version.
! 12: dnl
! 13: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 14: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 15: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 16: dnl Lesser General Public License for more details.
! 17: dnl
! 18: dnl You should have received a copy of the GNU Lesser General Public
! 19: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 20: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 21: dnl Suite 330, Boston, MA 02111-1307, USA.
! 22:
! 23: include(`../config.m4')
! 24:
! 25:
! 26: C P4: popcount 8.5 cycles/limb
! 27: C hamdist 9.5 cycles/limb
! 28:
! 29:
! 30: C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
! 31: C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
! 32: C
! 33: C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
! 34: C Two movd's and a punpckldq seems to be the same speed as an aligned movq,
! 35: C and using them saves fiddling about with alignment testing on entry.
! 36: C
! 37: C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
! 38: C might be possible, but 8.5 c/l relying on out-of-order execution is
! 39: C already quite reasonable.
! 40: C Build-time guard: raise an m4 error unless OPERATION_popcount or OPERATION_hamdist is defined.
! 41: ifdef(`OPERATION_popcount',,
! 42: `ifdef(`OPERATION_hamdist',,
! 43: `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
! 44: ')')')
! 45: C HAM(text) expands to text when OPERATION_hamdist is defined and to nothing otherwise.
! 46: define(HAM,
! 47: m4_assert_numargs(1)
! 48: `ifdef(`OPERATION_hamdist',`$1')')
! 49: C POP(text) expands to text when OPERATION_popcount is defined and to nothing otherwise.
! 50: define(POP,
! 51: m4_assert_numargs(1)
! 52: `ifdef(`OPERATION_popcount',`$1')')
! 53: C Per-operation stack argument offsets and the name of the function to emit. NOTE(review): offsets are presumably relative to esp at entry, with the return address at 0 - confirm in defframe.
! 54: HAM(`
! 55: defframe(PARAM_SIZE, 12)
! 56: defframe(PARAM_SRC2, 8)
! 57: defframe(PARAM_SRC, 4)
! 58: define(M4_function,mpn_hamdist)
! 59: ')
! 60: POP(`
! 61: defframe(PARAM_SIZE, 8)
! 62: defframe(PARAM_SRC, 4)
! 63: define(M4_function,mpn_popcount)
! 64: ')
! 65: C Lists both entry points this source can be built as. NOTE(review): MULFUNC_PROLOGUE is defined outside this file; presumably read by the build machinery - confirm.
! 66: MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
! 67:
! 68: C Non-PIC only: the three 64-bit masks used by the bit-counting steps, kept as read-only data. PIC builds construct them in registers in the function body instead.
! 69: ifdef(`PIC',,`
! 70: dnl non-PIC
! 71: RODATA
! 72: ALIGN(8) C each mask is read with one 8-byte movq
! 73: L(rodata_AAAAAAAAAAAAAAAA):
! 74: .long 0xAAAAAAAA C the same 32-bit pattern twice forms the 64-bit mask
! 75: .long 0xAAAAAAAA
! 76: L(rodata_3333333333333333):
! 77: .long 0x33333333
! 78: .long 0x33333333
! 79: L(rodata_0F0F0F0F0F0F0F0F):
! 80: .long 0x0F0F0F0F
! 81: .long 0x0F0F0F0F
! 82: ')
! 83: C Entry point: M4_function is mpn_popcount or mpn_hamdist, as chosen by the defines earlier in the file. The bit count is returned in eax.
! 84: TEXT
! 85: ALIGN(16)
! 86:
! 87: PROLOGUE(M4_function)
! 88: deflit(`FRAME',0)
! 89:
! 90: movl PARAM_SIZE, %ecx C ecx = size in limbs
! 91: movl PARAM_SRC, %eax C eax = src pointer
! 92:
! 93: ifdef(`PIC',`
! 94: movl $0xAAAAAAAA, %edx C PIC: build each mask from an immediate so no data reference is needed
! 95: movd %edx, %mm7
! 96: punpckldq %mm7, %mm7 C mm7 = 0xAAAAAAAAAAAAAAAA (low dword copied into high dword)
! 97:
! 98: movl $0x33333333, %edx
! 99: movd %edx, %mm6
! 100: punpckldq %mm6, %mm6 C mm6 = 0x3333333333333333
! 101:
! 102: movl $0x0F0F0F0F, %edx
! 103: movd %edx, %mm5
! 104: punpckldq %mm5, %mm5 C mm5 = 0x0F0F0F0F0F0F0F0F
! 105:
! 106: HAM(` movl PARAM_SRC2, %edx') C hamdist: edx = src2, loaded only now because edx was scratch for the masks above
! 107:
! 108: ',`
! 109: dnl non-PIC
! 110: HAM(` movl PARAM_SRC2, %edx') C hamdist: edx = src2
! 111: movq L(rodata_AAAAAAAAAAAAAAAA), %mm7 C non-PIC: masks are loaded from the read-only data table
! 112: movq L(rodata_3333333333333333), %mm6
! 113: movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
! 114: ')
! 115:
! 116: pxor %mm4, %mm4 C zero, the second operand for psadbw below
! 117: pxor %mm0, %mm0 C total, starts at 0
! 118:
! 119: subl $1, %ecx C ecx = size-1
! 120: ja L(top) C unsigned test: taken only when size >= 2
! 121: C Fall through for size 1 with ecx = 0. NOTE(review): size 0 would also fall through, with ecx = -1 and a load from before src; presumably callers guarantee size >= 1 - confirm.
! 122: L(last):
! 123: movd (%eax,%ecx,4), %mm1 C src high limb (ecx is 0 here for size >= 1); movd zero-fills the upper half of mm1
! 124: HAM(` movd (%edx,%ecx,4), %mm2 C src2 high limb
! 125: pxor %mm2, %mm1 C mm1 = bits that differ between src and src2
! 126: ')
! 127: jmp L(loaded) C reuse the counting code shared with the loop
! 128:
! 129:
! 130: L(top):
! 131: C eax src
! 132: C ebx
! 133: C ecx counter, size-1 to 2 or 1, inclusive
! 134: C edx [hamdist] src2
! 135: C
! 136: C mm0 total (low dword)
! 137: C mm1 (scratch)
! 138: C mm2 (scratch)
! 139: C mm3 [hamdist] (scratch)
! 140: C mm4 0x0000000000000000
! 141: C mm5 0x0F0F0F0F0F0F0F0F
! 142: C mm6 0x3333333333333333
! 143: C mm7 0xAAAAAAAAAAAAAAAA
! 144:
! 145: movd (%eax), %mm1 C first limb of the pair
! 146: movd 4(%eax), %mm2 C second limb of the pair
! 147: punpckldq %mm2, %mm1 C mm1 = both limbs combined into one qword
! 148: addl $8, %eax C advance src by two 4-byte limbs
! 149:
! 150: HAM(` movd (%edx), %mm2 C same two-limb load, from src2
! 151: movd 4(%edx), %mm3
! 152: punpckldq %mm3, %mm2
! 153: pxor %mm2, %mm1 C mm1 = src ^ src2
! 154: addl $8, %edx C advance src2 by two 4-byte limbs
! 155: ')
! 156:
! 157: L(loaded):
! 158: movq %mm7, %mm2
! 159: pand %mm1, %mm2 C keep the upper bit of every 2-bit field
! 160: psrlq $1, %mm2 C move it down to the lower bit position
! 161: psubd %mm2, %mm1 C bit pairs: each 2-bit field now holds its own count, 0 to 2
! 162:
! 163: movq %mm6, %mm2
! 164: pand %mm1, %mm2 C lower pair of each nibble
! 165: psrlq $2, %mm1
! 166: pand %mm6, %mm1 C upper pair of each nibble, shifted down
! 167: paddd %mm2, %mm1 C nibbles: each 4-bit field holds a count, 0 to 4
! 168:
! 169: movq %mm5, %mm2
! 170: pand %mm1, %mm2 C low nibble of each byte
! 171: psrlq $4, %mm1
! 172: pand %mm5, %mm1 C high nibble of each byte, shifted down
! 173: paddd %mm2, %mm1 C bytes: each byte holds a count, 0 to 8
! 174:
! 175: psadbw( %mm4, %mm1) C sum of absolute differences against zero: adds the 8 byte counts into the low word of mm1
! 176: paddd %mm1, %mm0 C to total
! 177:
! 178: subl $2, %ecx C account for the limbs just processed
! 179: jg L(top) C ecx > 0: at least two more limbs
! 180:
! 181: C ecx is 0 (one further limb), -1 (no further limbs), or -2 (arrived here via last, also finished)
! 182: jz L(last)
! 183:
! 184:
! 185: movd %mm0, %eax C return value = low dword of total
! 186: emms C clear MMX state before returning to the caller
! 187: ret
! 188:
! 189: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>