Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/mul_1.asm, Revision 1.1
1.1 ! ohara 1: dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
! 2:
! 3: dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
! 4: dnl
! 5: dnl This file is part of the GNU MP Library.
! 6: dnl
! 7: dnl The GNU MP Library is free software; you can redistribute it and/or
! 8: dnl modify it under the terms of the GNU Lesser General Public License as
! 9: dnl published by the Free Software Foundation; either version 2.1 of the
! 10: dnl License, or (at your option) any later version.
! 11: dnl
! 12: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 13: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 14: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 15: dnl Lesser General Public License for more details.
! 16: dnl
! 17: dnl You should have received a copy of the GNU Lesser General Public
! 18: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 19: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 20: dnl Suite 330, Boston, MA 02111-1307, USA.
! 21:
! 22: include(`../config.m4')
! 23:
! 24:
! 25: C cycles/limb
! 26: C P5: 12.0 for 32-bit multiplier
! 27: C 7.0 for 16-bit multiplier
! 28:
! 29:
! 30: C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 31: C mp_limb_t multiplier);
! 32: C
! 33: C When the multiplier is 16 bits some special case MMX code is used. Small
! 34: C multipliers might arise reasonably often from mpz_mul_ui etc. If the size
! 35: C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
! 36: C size==8 end up being quite close. If src isn't aligned to an 8 byte
! 37: C boundary then one limb is processed separately with roughly a 5 cycle
! 38: C penalty, so in that case it's say size==8 and size==9 which are close.
! 39: C
! 40: C Alternatives:
! 41: C
! 42: C MMX is not believed to be of any use for 32-bit multipliers, since for
! 43: C instance the current method would just have to be more or less duplicated
! 44: C for the high and low halves of the multiplier, and would probably
! 45: C therefore run at about 14 cycles, which is slower than the plain integer
! 46: C at 12.
! 47: C
! 48: C Adding the high and low MMX products using integer code seems best. An
! 49: C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
! 50: C any joy. Perhaps something could be done keeping the values signed and
! 51: C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
! 52: C perhaps not.
! 53: C
! 54: C Future:
! 55: C
! 56: C An mpn_mul_1c entrypoint would need a double carry out of the low result
! 57: C limb in the 16-bit code, unless it could be assumed the carry fits in 16
! 58: C bits, possibly as carry<multiplier, this being true of a big calculation
! 59: C done piece by piece. But let's worry about that if/when mul_1c is
! 60: C actually used.
! 61:
! 62: defframe(PARAM_MULTIPLIER,16)
! 63: defframe(PARAM_SIZE, 12)
! 64: defframe(PARAM_SRC, 8)
! 65: defframe(PARAM_DST, 4)
! 66:
! 67: TEXT
! 68:
! 69: ALIGN(8)
! 70: PROLOGUE(mpn_mul_1)
! 71: deflit(`FRAME',0)
! 72:
! 73: movl PARAM_SIZE, %ecx
! 74: movl PARAM_SRC, %edx
! 75:
! 76: cmpl $1, %ecx
! 77: jne L(two_or_more)
! 78:
! 79: C one limb only
! 80:
! 81: movl PARAM_MULTIPLIER, %eax
! 82: movl PARAM_DST, %ecx
! 83:
! 84: mull (%edx)
! 85:
! 86: movl %eax, (%ecx)
! 87: movl %edx, %eax
! 88:
! 89: ret
! 90:
! 91:
! 92: L(two_or_more):
! 93: C eax size
! 94: C ebx
! 95: C ecx carry
! 96: C edx
! 97: C esi src
! 98: C edi
! 99: C ebp
! 100:
! 101: pushl %esi FRAME_pushl()
! 102: pushl %edi FRAME_pushl()
! 103:
! 104: movl %edx, %esi C src
! 105: movl PARAM_DST, %edi
! 106:
! 107: movl PARAM_MULTIPLIER, %eax
! 108: pushl %ebx FRAME_pushl()
! 109:
! 110: leal (%esi,%ecx,4), %esi C src end
! 111: leal (%edi,%ecx,4), %edi C dst end
! 112:
! 113: negl %ecx C -size
! 114:
! 115: pushl %ebp FRAME_pushl()
! 116: cmpl $65536, %eax
! 117:
! 118: jb L(small)
! 119:
! 120:
! 121: L(big):
! 122: xorl %ebx, %ebx C carry limb
! 123: sarl %ecx C -size/2
! 124:
! 125: jnc L(top) C with carry flag clear
! 126:
! 127:
! 128: C size was odd, process one limb separately
! 129:
! 130: mull 4(%esi,%ecx,8) C m * src[0]
! 131:
! 132: movl %eax, 4(%edi,%ecx,8)
! 133: incl %ecx
! 134:
! 135: orl %edx, %ebx C carry limb, and clear carry flag
! 136:
! 137:
! 138: L(top):
! 139: C eax
! 140: C ebx carry
! 141: C ecx counter, negative
! 142: C edx
! 143: C esi src end
! 144: C edi dst end
! 145: C ebp (scratch carry)
! 146:
! 147: adcl $0, %ebx
! 148: movl (%esi,%ecx,8), %eax
! 149:
! 150: mull PARAM_MULTIPLIER
! 151:
! 152: movl %edx, %ebp
! 153: addl %eax, %ebx
! 154:
! 155: adcl $0, %ebp
! 156: movl 4(%esi,%ecx,8), %eax
! 157:
! 158: mull PARAM_MULTIPLIER
! 159:
! 160: movl %ebx, (%edi,%ecx,8)
! 161: addl %ebp, %eax
! 162:
! 163: movl %eax, 4(%edi,%ecx,8)
! 164: incl %ecx
! 165:
! 166: movl %edx, %ebx
! 167: jnz L(top)
! 168:
! 169:
! 170: adcl $0, %ebx
! 171: popl %ebp
! 172:
! 173: movl %ebx, %eax
! 174: popl %ebx
! 175:
! 176: popl %edi
! 177: popl %esi
! 178:
! 179: ret
! 180:
! 181:
! 182: L(small):
! 183: C Special case for 16-bit multiplier.
! 184: C
! 185: C eax multiplier
! 186: C ebx
! 187: C ecx -size
! 188: C edx src
! 189: C esi src end
! 190: C edi dst end
! 191: C ebp multiplier
! 192:
! 193: C size<3 not supported here. At size==3 we're already a couple of
! 194: C cycles faster, so there's no threshold as such, just use the MMX
! 195: C as soon as possible.
! 196:
! 197: cmpl $-3, %ecx
! 198: ja L(big)
! 199:
! 200: movd %eax, %mm7 C m
! 201: pxor %mm6, %mm6 C initial carry word
! 202:
! 203: punpcklwd %mm7, %mm7 C m replicated 2 times
! 204: addl $2, %ecx C -size+2
! 205:
! 206: punpckldq %mm7, %mm7 C m replicated 4 times
! 207: andl $4, %edx C test alignment, clear carry flag
! 208:
! 209: movq %mm7, %mm0 C m
! 210: jz L(small_entry)
! 211:
! 212:
! 213: C Source is unaligned, process one limb separately.
! 214: C
! 215: C Plain integer code is used here, since it's smaller and is about
! 216: C the same 13 cycles as an mmx block would be.
! 217: C
! 218: C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
! 219: C the use of separate incl and orl.
! 220:
! 221: mull -8(%esi,%ecx,4) C m * src[0]
! 222:
! 223: movl %eax, -8(%edi,%ecx,4) C dst[0]
! 224: incl %ecx C one limb processed
! 225:
! 226: movd %edx, %mm6 C initial carry
! 227:
! 228: orl %eax, %eax C clear carry flag
! 229: jmp L(small_entry)
! 230:
! 231:
! 232: C The scheduling here is quite tricky, since so many instructions have
! 233: C pairing restrictions. In particular the js won't pair with a movd, and
! 234: C can't be paired with an adc since it wants flags from the inc, so
! 235: C instructions are rotated to the top of the loop to find somewhere useful
! 236: C for it.
! 237: C
! 238: C Trouble has been taken to avoid overlapping successive loop iterations,
! 239: C since that would greatly increase the size of the startup and finishup
! 240: C code. Actually there's probably not much advantage to be had from
! 241: C overlapping anyway, since the difficulties are mostly with pairing, not
! 242: C with latencies as such.
! 243: C
! 244: C In the comments x represents the src data and m the multiplier (16
! 245: C bits, but replicated 4 times).
! 246: C
! 247: C The m signs calculated in %mm3 are a loop invariant and could be held in
! 248: C say %mm5, but that would save only one instruction and hence be no faster.
! 249:
! 250: L(small_top):
! 251: C eax l.low, then l.high
! 252: C ebx (h.low)
! 253: C ecx counter, -size+2 to 0 or 1
! 254: C edx (h.high)
! 255: C esi &src[size]
! 256: C edi &dst[size]
! 257: C ebp
! 258: C
! 259: C %mm0 (high products)
! 260: C %mm1 (low products)
! 261: C %mm2 (adjust for m using x signs)
! 262: C %mm3 (adjust for x using m signs)
! 263: C %mm4
! 264: C %mm5
! 265: C %mm6 h.low, then carry
! 266: C %mm7 m replicated 4 times
! 267:
! 268: movd %mm6, %ebx C h.low
! 269: psrlq $32, %mm1 C l.high
! 270:
! 271: movd %mm0, %edx C h.high
! 272: movq %mm0, %mm6 C new c
! 273:
! 274: adcl %eax, %ebx
! 275: incl %ecx
! 276:
! 277: movd %mm1, %eax C l.high
! 278: movq %mm7, %mm0
! 279:
! 280: adcl %eax, %edx
! 281: movl %ebx, -16(%edi,%ecx,4)
! 282:
! 283: movl %edx, -12(%edi,%ecx,4)
! 284: psrlq $32, %mm6 C c
! 285:
! 286: L(small_entry):
! 287: pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high
! 288: movq %mm7, %mm1
! 289:
! 290: pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low
! 291: movq %mm7, %mm3
! 292:
! 293: movq -8(%esi,%ecx,4), %mm2 C x
! 294: psraw $15, %mm3 C m signs
! 295:
! 296: pand -8(%esi,%ecx,4), %mm3 C x selected by m signs
! 297: psraw $15, %mm2 C x signs
! 298:
! 299: paddw %mm3, %mm0 C add x to h if m neg
! 300: pand %mm7, %mm2 C m selected by x signs
! 301:
! 302: paddw %mm2, %mm0 C add m to h if x neg
! 303: incl %ecx
! 304:
! 305: movd %mm1, %eax C l.low
! 306: punpcklwd %mm0, %mm6 C c + h.low << 16
! 307:
! 308: psrlq $16, %mm0 C h.high
! 309: js L(small_top)
! 310:
! 311:
! 312:
! 313:
! 314: movd %mm6, %ebx C h.low
! 315: psrlq $32, %mm1 C l.high
! 316:
! 317: adcl %eax, %ebx
! 318: popl %ebp FRAME_popl()
! 319:
! 320: movd %mm0, %edx C h.high
! 321: psrlq $32, %mm0 C l.high
! 322:
! 323: movd %mm1, %eax C l.high
! 324:
! 325: adcl %eax, %edx
! 326: movl %ebx, -12(%edi,%ecx,4)
! 327:
! 328: movd %mm0, %eax C c
! 329:
! 330: adcl $0, %eax
! 331: movl %edx, -8(%edi,%ecx,4)
! 332:
! 333: orl %ecx, %ecx
! 334: jnz L(small_done) C final %ecx==1 means even, ==0 odd
! 335:
! 336:
! 337: C Size odd, one extra limb to process.
! 338: C Plain integer code is used here, since it's smaller and is about
! 339: C the same speed as another mmx block would be.
! 340:
! 341: movl %eax, %ecx
! 342: movl PARAM_MULTIPLIER, %eax
! 343:
! 344: mull -4(%esi)
! 345:
! 346: addl %ecx, %eax
! 347:
! 348: adcl $0, %edx
! 349: movl %eax, -4(%edi)
! 350:
! 351: movl %edx, %eax
! 352: L(small_done):
! 353: popl %ebx
! 354:
! 355: popl %edi
! 356: popl %esi
! 357:
! 358: emms
! 359:
! 360: ret
! 361:
! 362: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>