Annotation of OpenXM_contrib/gmp/mpn/x86/p6/aorsmul_1.asm, Revision 1.1
1.1 ! maekawa 1: dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
! 2: dnl
! 3: dnl P6: 6.35 cycles/limb (at 16 limbs/loop).
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: dnl P6 UNROLL_COUNT cycles/limb
! 30: dnl 8 6.7
! 31: dnl 16 6.35
! 32: dnl 32 6.3
! 33: dnl 64 6.3
! 34: dnl Maximum possible with the current code is 64.
! 35:
! 36: deflit(UNROLL_COUNT, 16) dnl limbs emitted per pass of the unrolled loop; 16 is the 6.35 cycles/limb row of the table above
! 37:
! 38:
! 39: ifdef(`OPERATION_addmul_1', `
! 40: define(M4_inst, addl) C instruction that folds each product limb into dst
! 41: define(M4_function_1, mpn_addmul_1) C entry point starting from a zero carry
! 42: define(M4_function_1c, mpn_addmul_1c) C entry point taking an initial carry limb
! 43: define(M4_description, add it to) C wording referenced by the description comment further down
! 44: define(M4_desc_retval, carry) C name of the returned limb in that same comment
! 45: ',`ifdef(`OPERATION_submul_1', `
! 46: define(M4_inst, subl) C instruction that folds each product limb into dst
! 47: define(M4_function_1, mpn_submul_1) C entry point starting from a zero borrow
! 48: define(M4_function_1c, mpn_submul_1c) C entry point taking an initial borrow limb
! 49: define(M4_description, subtract it from) C wording referenced by the description comment further down
! 50: define(M4_desc_retval, borrow) C name of the returned limb in that same comment
! 51: ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
! 52: ')')')
! 53: C NOTE(review): MULFUNC_PROLOGUE is defined outside this file; it is given all four entry point names this source can be built as - confirm its exact role.
! 54: MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
! 55:
! 56:
! 57: C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 58: C mp_limb_t mult);
! 59: C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 60: C mp_limb_t mult, mp_limb_t carry);
! 61: C
! 62: C Calculate src,size multiplied by mult and M4_description dst,size.
! 63: C Return the M4_desc_retval limb from the top of the result.
! 64: C
! 65: C This code is pretty much the same as the K6 code. The unrolled loop is
! 66: C the same, but there are just a few scheduling tweaks in the setups and the
! 67: C simple loop.
! 68: C
! 69: C A number of variations have been tried for the unrolled loop, with one or
! 70: C two carries, and with loads scheduled earlier, but nothing faster than 6
! 71: C cycles/limb has been found.
! 72:
! 73: ifdef(`PIC',`
! 74: deflit(UNROLL_THRESHOLD, 5) C PIC build: sizes of 5 limbs or more take the unrolled path
! 75: ',`
! 76: deflit(UNROLL_THRESHOLD, 5) C non-PIC build: currently the same value as the PIC branch
! 77: ')
! 78:
! 79: defframe(PARAM_CARRY, 20) C fifth argument, carry; only the M4_function_1c entry reads it
! 80: defframe(PARAM_MULTIPLIER,16) C fourth argument, mult
! 81: defframe(PARAM_SIZE, 12) C third argument, size
! 82: defframe(PARAM_SRC, 8) C second argument, src
! 83: defframe(PARAM_DST, 4) C first argument, dst; NOTE(review): defframe is defined elsewhere, offsets presumably relative to esp at entry and adjusted by FRAME
! 84:
! 85: .text
! 86: ALIGN(32)
! 87: C Carry-in entry point: load the caller-supplied carry into ebx and join the shared body.
! 88: PROLOGUE(M4_function_1c)
! 89: pushl %ebx C saved here, popped by the exit paths of the shared body
! 90: deflit(`FRAME',4) C one register pushed so far
! 91: movl PARAM_CARRY, %ebx C ebx = initial carry limb
! 92: jmp LF(M4_function_1,start_nc) C join M4_function_1 just after its xorl of ebx
! 93: EPILOGUE()
! 94:
! 95: PROLOGUE(M4_function_1)
! 96: pushl %ebx C explicit l suffix, matching the M4_function_1c entry and every other push and pop here
! 97: deflit(`FRAME',4)
! 98: xorl %ebx, %ebx C initial carry
! 99: C Shared body: both entry points arrive here with ebx pushed and holding the initial carry.
! 100: L(start_nc):
! 101: movl PARAM_SIZE, %ecx C ecx = size
! 102: pushl %esi
! 103: deflit(`FRAME',8)
! 104:
! 105: movl PARAM_SRC, %esi C esi = src
! 106: pushl %edi
! 107: deflit(`FRAME',12)
! 108:
! 109: movl PARAM_DST, %edi C edi = dst
! 110: pushl %ebp
! 111: deflit(`FRAME',16)
! 112: cmpl $UNROLL_THRESHOLD, %ecx
! 113:
! 114: movl PARAM_MULTIPLIER, %ebp C ebp = multiplier; mov leaves the cmpl flags intact
! 115: jae L(unroll) C size >= UNROLL_THRESHOLD takes the unrolled code
! 116:
! 117:
! 118: C simple loop
! 119: C this is offset 0x22, so close enough to aligned
! 120: L(simple):
! 121: C eax scratch
! 122: C ebx carry
! 123: C ecx counter
! 124: C edx scratch
! 125: C esi src
! 126: C edi dst
! 127: C ebp multiplier
! 128:
! 129: movl (%esi), %eax C eax = next src limb
! 130: addl $4, %edi C dst advanced early; the store below uses -4(%edi)
! 131:
! 132: mull %ebp C edx:eax = src limb * multiplier
! 133:
! 134: addl %ebx, %eax C add the carry from the previous limb
! 135: adcl $0, %edx
! 136:
! 137: M4_inst %eax, -4(%edi) C addl or subl of the low product word into dst
! 138: movl %edx, %ebx C mov keeps the carry flag set by M4_inst
! 139:
! 140: adcl $0, %ebx C next carry = high word + carry or borrow out of dst
! 141: decl %ecx
! 142:
! 143: leal 4(%esi), %esi C lea advances src without disturbing the decl flags
! 144: jnz L(simple)
! 145:
! 146:
! 147: popl %ebp
! 148: popl %edi
! 149:
! 150: popl %esi
! 151: movl %ebx, %eax C return value = final carry limb
! 152:
! 153: popl %ebx
! 154: ret
! 155:
! 156:
! 157:
! 158: C------------------------------------------------------------------------------
! 159: C VAR_JUMP holds the computed jump temporarily because there's not enough
! 160: C registers when doing the mul for the initial two carry limbs.
! 161: C
! 162: C The add/adc for the initial carry in %ebx is necessary only for the
! 163: C mpn_add/submul_1c entry points. Duplicating the startup code to
! 164: C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
! 165: C idea.
! 166:
! 167: dnl overlapping with parameters already fetched
! 168: define(VAR_COUNTER,`PARAM_SIZE') C size is already in ecx, so its stack slot is reused
! 169: define(VAR_JUMP, `PARAM_DST') C dst is already in edi, so its stack slot is reused
! 170:
! 171: C this is offset 0x43, so close enough to aligned
! 172: L(unroll):
! 173: C eax
! 174: C ebx initial carry
! 175: C ecx size
! 176: C edx
! 177: C esi src
! 178: C edi dst
! 179: C ebp
! 180:
! 181: movl %ecx, %edx
! 182: decl %ecx
! 183:
! 184: subl $2, %edx
! 185: negl %ecx
! 186:
! 187: shrl $UNROLL_LOG2, %edx C edx = (size-2) >> UNROLL_LOG2
! 188: andl $UNROLL_MASK, %ecx C ecx = (-(size-1)) & UNROLL_MASK, called n below
! 189:
! 190: movl %edx, VAR_COUNTER C loop counter, decremented at the bottom of L(top)
! 191: movl %ecx, %edx
! 192:
! 193: C 15 code bytes per limb
! 194: ifdef(`PIC',`
! 195: call L(pic_calc) C comes back with edx = jump target and ecx = -n
! 196: L(here):
! 197: ',`
! 198: shll $4, %edx C edx = 16*n
! 199: negl %ecx C ecx = -n
! 200:
! 201: leal L(entry) (%edx,%ecx,1), %edx C edx = L(entry) + 15*n
! 202: ')
! 203: movl (%esi), %eax C src low limb
! 204:
! 205: movl %edx, VAR_JUMP C park the jump target; mull below overwrites edx
! 206: leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi C src = src + 4 - 4*n, plus 128 only when UNROLL_BYTES is 256
! 207:
! 208: mull %ebp
! 209:
! 210: addl %ebx, %eax C initial carry (from _1c)
! 211: adcl $0, %edx
! 212:
! 213: movl %edx, %ebx C high carry
! 214: leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi C dst = dst - 4*n, plus 128 only when UNROLL_BYTES is 256
! 215:
! 216: movl VAR_JUMP, %edx
! 217: testl $1, %ecx C odd n enters at the second limb of a chunk, where ebx is stored and ecx accumulates
! 218: movl %eax, %ecx C low carry
! 219:
! 220: cmovnz( %ebx, %ecx) C high,low carry other way around
! 221: cmovnz( %eax, %ebx) C still on the testl flags; the mov in between does not alter them
! 222:
! 223: jmp *%edx C enter the unrolled code part way through
! 224:
! 225:
! 226: ifdef(`PIC',`
! 227: L(pic_calc):
! 228: shll $4, %edx C edx = 16*n
! 229: negl %ecx C ecx = -n
! 230:
! 231: C See README.family about old gas bugs
! 232: leal (%edx,%ecx,1), %edx C edx = 15*n, matching 15 code bytes per limb
! 233: addl $L(entry)-L(here), %edx C plus the distance from L(here) to L(entry)
! 234:
! 235: addl (%esp), %edx C plus the return address on the stack, which is L(here)
! 236:
! 237: ret
! 238: ')
! 239:
! 240:
! 241: C -----------------------------------------------------------
! 242: ALIGN(32)
! 243: L(top):
! 244: deflit(`FRAME',16)
! 245: C eax scratch
! 246: C ebx carry hi
! 247: C ecx carry lo
! 248: C edx scratch
! 249: C esi src
! 250: C edi dst
! 251: C ebp multiplier
! 252: C
! 253: C VAR_COUNTER loop counter
! 254: C
! 255: C 15 code bytes per limb
! 256:
! 257: addl $UNROLL_BYTES, %edi C dst advances at the top; src advances at the bottom via leal
! 258:
! 259: L(entry):
! 260: deflit(CHUNK_COUNT,2) C each forloop pass below emits code for two limbs
! 261: forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
! 262: deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
! 263: deflit(`disp1', eval(disp0 + 4))
! 264:
! 265: Zdisp( movl, disp0,(%esi), %eax) C NOTE(review): Zdisp is defined elsewhere; presumably it keeps the encoding size fixed when disp0 is 0 - confirm
! 266: mull %ebp
! 267: Zdisp( M4_inst,%ecx, disp0,(%edi)) C fold carry lo into dst
! 268: adcl %eax, %ebx C ebx = carry hi + low product word + carry or borrow out of dst
! 269: movl %edx, %ecx
! 270: adcl $0, %ecx C ecx = high product word + carry; the two carry registers have now swapped roles
! 271:
! 272: movl disp1(%esi), %eax
! 273: mull %ebp
! 274: M4_inst %ebx, disp1(%edi) C second limb of the chunk: ebx is the value folded into dst
! 275: adcl %eax, %ecx
! 276: movl %edx, %ebx
! 277: adcl $0, %ebx C roles are back to ebx = carry hi and ecx = carry lo
! 278: ')
! 279:
! 280: decl VAR_COUNTER
! 281: leal UNROLL_BYTES(%esi), %esi C lea advances src without changing the decl flags
! 282:
! 283: jns L(top) C loop until VAR_COUNTER goes negative
! 284:
! 285:
! 286: deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
! 287:
! 288: M4_inst %ecx, disp0(%edi) C fold the remaining carry lo into the last dst limb
! 289: movl %ebx, %eax
! 290:
! 291: popl %ebp
! 292: popl %edi
! 293:
! 294: popl %esi
! 295: popl %ebx
! 296: adcl $0, %eax C mov and pop left the carry flag from M4_inst intact; return carry hi plus that flag
! 297:
! 298: ret
! 299:
! 300: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>