Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mul_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_mul_1 -- mpn by limb multiply.
! 2: dnl
! 3: dnl K7: 3.4 cycles/limb (at 16 limbs/loop).
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: dnl K7: UNROLL_COUNT cycles/limb
! 30: dnl 8 3.9
! 31: dnl 16 3.4
! 32: dnl 32 3.4
! 33: dnl 64 3.35
! 34: dnl Maximum possible with the current code is 64.
! 35:
! 36: deflit(UNROLL_COUNT, 16) C limbs per pass of the unrolled loop; timings for other values are in the table above
! 37:
! 38:
! 39: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 40: C mp_limb_t multiplier);
! 41: C mp_limb_t mpn_mul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 42: C mp_limb_t multiplier, mp_limb_t carry);
! 43: C
! 44: C Multiply src,size by multiplier and store the result in dst,size.
! 45: C Return the carry limb from the top of the result.
! 46: C
! 47: C mpn_mul_1c() accepts an initial carry for the calculation; it is added
! 48: C into the low limb of the destination.
! 49: C
! 50: C Variations on the unrolled loop have been tried, with the current
! 51: C registers or with the counter on the stack to free up ecx. The current
! 52: C code is the fastest found.
! 53: C
! 54: C An interesting effect is that removing the stores "movl %ebx, disp0(%edi)"
! 55: C from the unrolled loop actually slows it down to 5.0 cycles/limb. Code
! 56: C with this change can be tested on sizes of the form UNROLL_COUNT*n+1
! 57: C without having to change the computed jump. There's obviously something
! 58: C fishy going on, perhaps with what execution units the mul needs.
! 59:
! 60: defframe(PARAM_CARRY, 20) C fifth argument, read only by mpn_mul_1c
! 61: defframe(PARAM_MULTIPLIER,16) C the single-limb multiplier
! 62: defframe(PARAM_SIZE, 12) C number of limbs at src and dst
! 63: defframe(PARAM_SRC, 8) C source limb pointer
! 64: defframe(PARAM_DST, 4) C destination limb pointer
! 65:
! 66: defframe(SAVE_EBP, -4) C save slots; presumably offsets from the entry-time esp, adjusted via FRAME - confirm against the defframe definition
! 67: defframe(SAVE_EDI, -8)
! 68: defframe(SAVE_ESI, -12)
! 69: defframe(SAVE_EBX, -16)
! 70: deflit(STACK_SPACE, 16) C bytes subtracted from esp to hold the four SAVE_ slots
! 71:
! 72: dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
! 73: ifdef(`PIC',`
! 74: deflit(UNROLL_THRESHOLD, 7) C PIC build: sizes below 7 limbs use the simple loop
! 75: ',`
! 76: deflit(UNROLL_THRESHOLD, 5) C non-PIC build: sizes below 5 limbs use the simple loop
! 77: ')
! 78:
! 79: .text
! 80: ALIGN(32)
! 81: PROLOGUE(mpn_mul_1c) C entry point that takes an explicit initial carry
! 82: deflit(`FRAME',0) C no stack adjustment has been made yet
! 83: movl PARAM_CARRY, %edx C edx = caller-supplied initial carry
! 84: jmp LF(mpn_mul_1,start_nc) C join mpn_mul_1 after the point where it zeroes edx
! 85: EPILOGUE()
! 86:
! 87:
! 88: PROLOGUE(mpn_mul_1) C entry point with no carry-in
! 89: deflit(`FRAME',0)
! 90: xorl %edx, %edx C initial carry is zero; mpn_mul_1c jumps in below with its own value in edx
! 91: L(start_nc): C common path, edx = initial carry
! 92: movl PARAM_SIZE, %ecx C ecx = size in limbs
! 93: subl $STACK_SPACE, %esp C make room for the four SAVE_ slots
! 94: deflit(`FRAME', STACK_SPACE)
! 95:
! 96: movl %edi, SAVE_EDI
! 97: movl %ebx, SAVE_EBX
! 98: movl %edx, %ebx C ebx = running carry from here on
! 99:
! 100: movl %esi, SAVE_ESI
! 101: movl PARAM_SRC, %esi C esi = src
! 102: cmpl $UNROLL_THRESHOLD, %ecx C flags are consumed by the jae below; the movl instructions in between leave them alone
! 103:
! 104: movl PARAM_DST, %edi C edi = dst
! 105: movl %ebp, SAVE_EBP
! 106: jae L(unroll) C size >= UNROLL_THRESHOLD (unsigned compare): take the unrolled loop
! 107: C Small sizes fall through to a one-limb-per-iteration loop.
! 108: leal (%esi,%ecx,4), %esi C esi = &src[size]
! 109: leal (%edi,%ecx,4), %edi C edi = &dst[size]
! 110: negl %ecx C ecx = -size, counted up towards zero
! 111: C NOTE(review): a size of 0 would not stop at the first incl below; presumably callers guarantee size >= 1 - confirm
! 112: movl PARAM_MULTIPLIER, %ebp
! 113:
! 114: L(simple):
! 115: C eax scratch
! 116: C ebx carry
! 117: C ecx counter (negative)
! 118: C edx scratch
! 119: C esi src
! 120: C edi dst
! 121: C ebp multiplier
! 122:
! 123: movl (%esi,%ecx,4), %eax C fetch the next src limb
! 124:
! 125: mull %ebp C edx:eax = limb * multiplier
! 126:
! 127: addl %ebx, %eax C low half plus carry-in, sets CF on overflow
! 128: movl %eax, (%edi,%ecx,4) C store the result limb
! 129: movl $0, %ebx C zero ebx with movl so CF survives (xorl would clear it)
! 130:
! 131: adcl %edx, %ebx C new carry = high half + CF
! 132: incl %ecx
! 133: jnz L(simple) C loop until the negative counter reaches zero
! 134:
! 135: movl %ebx, %eax C return value = final carry limb
! 136: movl SAVE_EBX, %ebx
! 137: movl SAVE_ESI, %esi
! 138:
! 139: movl SAVE_EDI, %edi
! 140: movl SAVE_EBP, %ebp
! 141: addl $STACK_SPACE, %esp C release the save area
! 142:
! 143: ret
! 144:
! 145:
! 146: C -----------------------------------------------------------------------------
! 147: C The mov to load the next source limb is done well ahead of the mul; this
! 148: C is necessary for full speed. It leads to one limb handled separately
! 149: C after the loop.
! 150: C
! 151: C When unrolling to 32 or more, an offset of +4 is used on the src pointer,
! 152: C to avoid having an 0x80 displacement in the code for the last limb in the
! 153: C unrolled loop. This is for a fair comparison between 16 and 32 unrolling.
! 154:
! 155: ifelse(eval(UNROLL_COUNT >= 32),1,`
! 156: deflit(SRC_OFFSET,4) C bias the src pointer by 4 bytes when unrolling 32 or more
! 157: ',`
! 158: deflit(SRC_OFFSET,) C empty: no bias; the uses below write SRC_OFFSET-0 or place it directly before an operand so an empty value still works
! 159: ')
! 160:
! 161: C this is offset 0x62, so close enough to aligned
! 162: L(unroll): C set up pointers and counter, then jump part-way into the unrolled loop
! 163: C eax
! 164: C ebx initial carry
! 165: C ecx size
! 166: C edx
! 167: C esi src
! 168: C edi dst
! 169: C ebp
! 170: deflit(`FRAME', STACK_SPACE)
! 171:
! 172: leal -1(%ecx), %edx C edx = size-1, since one limb is handled at the end
! 173: leal -2(%ecx), %ecx C and ecx is one less than edx
! 174: movl %ebp, SAVE_EBP C NOTE(review): ebp was already stored before the jae that reaches here; looks redundant, possibly kept for code layout - confirm before removing
! 175:
! 176: negl %edx C edx = -(size-1)
! 177: shrl $UNROLL_LOG2, %ecx C unrolled loop counter = (size-2) >> UNROLL_LOG2
! 178: movl (%esi), %eax C src low limb
! 179:
! 180: andl $UNROLL_MASK, %edx C edx = n, the number of loop slots skipped on the first pass (assumes UNROLL_MASK is UNROLL_COUNT-1; defined elsewhere - confirm)
! 181: movl PARAM_DST, %edi C NOTE(review): edi already holds PARAM_DST from the entry code; looks redundant - confirm before removing
! 182:
! 183: movl %edx, %ebp C ebp = n
! 184: shll $4, %edx C edx = 16*n
! 185:
! 186: C 17 code bytes per limb, so the entry offset is 16*n + n
! 187: ifdef(`PIC',`
! 188: call L(add_eip_to_edx) C PIC: the helper turns edx into the run-time address of the entry slot
! 189: L(here):
! 190: ',`
! 191: leal L(entry) (%edx,%ebp), %edx C edx = L(entry) + 17*n
! 192: ')
! 193: negl %ebp C ebp = -n
! 194:
! 195: leal ifelse(UNROLL_BYTES,256,128+) SRC_OFFSET(%esi,%ebp,4), %esi
! 196: leal ifelse(UNROLL_BYTES,256,128) (%edi,%ebp,4), %edi
! 197: movl PARAM_MULTIPLIER, %ebp C ebp = multiplier; the two leal above backed src and dst up by n limbs so slot n addresses the first limb
! 198:
! 199: jmp *%edx C enter the unrolled loop at slot n
! 200:
! 201:
! 202: ifdef(`PIC',`
! 203: L(add_eip_to_edx):
! 204: C See README.family about old gas bugs
! 205: leal (%edx,%ebp), %edx C edx = 16*n + n = 17*n
! 206: addl $L(entry)-L(here), %edx C plus the distance from L(here) to L(entry)
! 207: addl (%esp), %edx C plus the return address on the stack, which is L(here)
! 208: ret
! 209: ')
! 210:
! 211:
! 212: C ----------------------------------------------------------------------------
! 213: ALIGN(32)
! 214: L(top):
! 215: C eax next src limb
! 216: C ebx carry
! 217: C ecx counter
! 218: C edx scratch
! 219: C esi src+4
! 220: C edi dst
! 221: C ebp multiplier
! 222: C
! 223: C 17 code bytes per limb processed
! 224: C The computed jump lands on slot n at L(entry) + 17*n, so every slot must assemble to exactly 17 bytes.
! 225: L(entry):
! 226: forloop(i, 0, UNROLL_COUNT-1, `
! 227: deflit(`disp_dst', eval(i*4 ifelse(UNROLL_BYTES,256,-128))) C dst displacement for this slot
! 228: deflit(`disp_src', eval(disp_dst + 4-(SRC_OFFSET-0))) C src displacement of the following limb
! 229:
! 230: mull %ebp C edx:eax = current limb * multiplier
! 231:
! 232: addl %eax, %ebx C ebx = low half + carry-in, sets CF on overflow
! 233: Zdisp( movl, disp_src,(%esi), %eax) C fetch the next src limb ahead of its mull
! 234: Zdisp( movl, %ebx, disp_dst,(%edi)) C store the result limb
! 235:
! 236: movl $0, %ebx C zero ebx with movl so CF survives
! 237: adcl %edx, %ebx C carry-out = high half + CF
! 238: ')
! 239:
! 240: decl %ecx C sets the sign flag tested by the jns below
! 241:
! 242: leal UNROLL_BYTES(%esi), %esi C leal advances the pointers without touching flags
! 243: leal UNROLL_BYTES(%edi), %edi
! 244: jns L(top) C loop while the counter is still >= 0
! 245:
! 246: C Final limb: its source value was loaded into eax by the last slot of the loop.
! 247: deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
! 248:
! 249: mull %ebp C edx:eax = last limb * multiplier
! 250:
! 251: addl %eax, %ebx C low half + carry-in, sets CF on overflow
! 252: movl $0, %eax C eax = 0 with CF preserved
! 253: movl SAVE_ESI, %esi
! 254:
! 255: movl %ebx, disp0(%edi) C store the top result limb
! 256: movl SAVE_EBX, %ebx
! 257: movl SAVE_EDI, %edi
! 258:
! 259: adcl %edx, %eax C return value = high half + CF
! 260: movl SAVE_EBP, %ebp
! 261: addl $STACK_SPACE, %esp C release the save area
! 262:
! 263: ret
! 264:
! 265: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>