Annotation of OpenXM_contrib/gmp/mpn/x86/pentium/mmx/rshift.asm, Revision 1.1
1.1 ! maekawa 1: dnl Intel P5 mpn_rshift -- mpn right shift.
! 2: dnl
! 3: dnl P5: 1.75 cycles/limb.
! 4:
! 5:
! 6: dnl Copyright (C) 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28:
! 29: C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 30: C unsigned shift);
! 31: C
! 32: C Shift src,size right by shift many bits and store the result in dst,size.
! 33: C Zeros are shifted in at the left. Return the bits shifted out at the
! 34: C right.
! 35: C
! 36: C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
! 37: C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
! 38: C
! 39: C Full speed depends on source and destination being aligned. Unaligned mmx
! 40: C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
! 41: C setups and finish-ups are done to ensure alignment for the loop.
! 42: C
! 43: C MMX shifts work out a bit faster even for the simple loop.
! 44:
! 45: defframe(PARAM_SHIFT,16)
! 46: defframe(PARAM_SIZE, 12)
! 47: defframe(PARAM_SRC, 8)
! 48: defframe(PARAM_DST, 4)
! 49: deflit(`FRAME',0)
! 50:
! 51: dnl Minimum 5, because the unrolled loop can't handle less.
! 52: deflit(UNROLL_THRESHOLD, 5)
! 53:
! 54: .text
! 55: ALIGN(8)
! 56:
! 57: PROLOGUE(mpn_rshift)
! 58:
! 59: pushl %ebx C saved here, popped again on every return path
! 60: pushl %edi C likewise, edi is used as scratch below
! 61: deflit(`FRAME',8)
! 62:
! 63: movl PARAM_SIZE, %eax C eax = size, in limbs
! 64: movl PARAM_DST, %edx C edx = dst
! 65:
! 66: movl PARAM_SRC, %ebx C ebx = src
! 67: movl PARAM_SHIFT, %ecx C ecx = shift
! 68:
! 69: cmp $UNROLL_THRESHOLD, %eax
! 70: jae L(unroll) C size >= UNROLL_THRESHOLD, unsigned compare
! 71:
! 72: decl %eax C size-1, zero flag set when size is 1
! 73: movl (%ebx), %edi C src low limb
! 74:
! 75: jnz L(simple) C flags still from the decl, movl leaves them alone
! 76:
! 77: shrdl( %cl, %edi, %eax) C eax was decremented to zero, receives the bits shifted out
! 78:
! 79: shrl %cl, %edi C shifted low limb, zeros come in at the top
! 80:
! 81: movl %edi, (%edx) C dst low limb
! 82: popl %edi C risk of data cache bank clash
! 83:
! 84: popl %ebx
! 85:
! 86: ret
! 87:
! 88:
! 89: C -----------------------------------------------------------------------------
! 90: ALIGN(8)
! 91: L(simple):
! 92: C eax size-1
! 93: C ebx src
! 94: C ecx shift
! 95: C edx dst
! 96: C esi
! 97: C edi
! 98: C ebp
! 99: deflit(`FRAME',8)
! 100:
! 101: movd (%ebx), %mm5 C src[0]
! 102: leal (%ebx,%eax,4), %ebx C &src[size-1]
! 103:
! 104: movd %ecx, %mm6 C rshift
! 105: leal -4(%edx,%eax,4), %edx C &dst[size-2]
! 106:
! 107: psllq $32, %mm5 C src[0] in the high half, low half zero
! 108: negl %eax C -(size-1), counts up to zero
! 109:
! 110:
! 111: C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
! 112: C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
! 113: C cycles and would be 8 in a simple loop. Using mmx helps the return value
! 114: C and last limb calculations too.
! 115:
! 116: L(simple_top):
! 117: C eax counter, limbs, negative
! 118: C ebx &src[size-1]
! 119: C ecx shift (not used in this loop, the count is in mm6)
! 120: C edx &dst[size-2]
! 121: C
! 122: C mm0 scratch
! 123: C mm5 return value
! 124: C mm6 shift
! 125:
! 126: movq (%ebx,%eax,4), %mm0 C two src limbs
! 127: incl %eax
! 128:
! 129: psrlq %mm6, %mm0 C low dword is now the finished dst limb
! 130:
! 131: movd %mm0, (%edx,%eax,4) C movd leaves the flags alone
! 132: jnz L(simple_top) C flags from the incl
! 133:
! 134:
! 135: movd (%ebx), %mm0 C src[size-1], high half zeroed by the movd load
! 136: psrlq %mm6, %mm5 C return value
! 137:
! 138: psrlq %mm6, %mm0
! 139: popl %edi
! 140:
! 141: movd %mm5, %eax C low dword of mm5 is the return value
! 142: popl %ebx
! 143:
! 144: movd %mm0, 4(%edx) C dst[size-1]
! 145:
! 146: emms C empty the mmx state before returning
! 147:
! 148: ret
! 149:
! 150:
! 151: C -----------------------------------------------------------------------------
! 152: ALIGN(8)
! 153: L(unroll):
! 154: C eax size
! 155: C ebx src
! 156: C ecx shift
! 157: C edx dst
! 158: C esi
! 159: C edi
! 160: C ebp
! 161: deflit(`FRAME',8)
! 162:
! 163: movd (%ebx), %mm5 C src[0]
! 164: movl $4, %edi C bit tested to detect 4mod8 addresses
! 165:
! 166: movd %ecx, %mm6 C rshift
! 167: testl %edi, %ebx C is src at 4mod8
! 168:
! 169: psllq $32, %mm5 C src[0] in the high half, for the return value
! 170: jz L(start_src_aligned) C flags from the testl, mmx ops leave them alone
! 171:
! 172:
! 173: C src isn't aligned, process low limb separately (marked xxx) and
! 174: C step src and dst by one limb, making src aligned.
! 175: C
! 176: C source ebx
! 177: C --+-------+-------+-------+
! 178: C | xxx |
! 179: C --+-------+-------+-------+
! 180: C 4mod8 0mod8 4mod8
! 181: C
! 182: C dest edx
! 183: C --+-------+-------+
! 184: C | | xxx |
! 185: C --+-------+-------+
! 186:
! 187: movq (%ebx), %mm0 C unaligned load
! 188:
! 189: psrlq %mm6, %mm0
! 190: addl $4, %ebx C src now 0mod8
! 191:
! 192: decl %eax C one limb handled
! 193:
! 194: movd %mm0, (%edx) C dst low limb
! 195: addl $4, %edx
! 196: L(start_src_aligned):
! 197:
! 198:
! 199: movq (%ebx), %mm1 C low src qword, src is 0mod8 here
! 200: testl %edi, %edx C is dst at 4mod8
! 201:
! 202: psrlq %mm6, %mm5 C retval
! 203: jz L(start_dst_aligned) C flags from the testl
! 204:
! 205: C dst isn't aligned, add 4 to make it so, and pretend the shift is
! 206: C 32 bits extra. Low limb of dst (marked xxx) handled here
! 207: C separately.
! 208: C
! 209: C source ebx
! 210: C --+-------+-------+
! 211: C | mm1 |
! 212: C --+-------+-------+
! 213: C 4mod8 0mod8
! 214: C
! 215: C dest edx
! 216: C --+-------+-------+-------+
! 217: C | xxx |
! 218: C --+-------+-------+-------+
! 219: C 4mod8 0mod8 4mod8
! 220:
! 221: movq %mm1, %mm0
! 222: addl $32, %ecx C new shift
! 223:
! 224: psrlq %mm6, %mm0 C mm6 is still the original shift here
! 225:
! 226: movd %ecx, %mm6 C mm6 = shift+32 from here on
! 227:
! 228: movd %mm0, (%edx) C low dst limb
! 229: addl $4, %edx
! 230: L(start_dst_aligned):
! 231:
! 232:
! 233: movq 8(%ebx), %mm3 C second src qword
! 234: negl %ecx
! 235:
! 236: movq %mm3, %mm2 C mm2 src qword
! 237: addl $64, %ecx C ecx = 64 minus the count held in mm6
! 238:
! 239: movd %ecx, %mm7 C lshift
! 240: psrlq %mm6, %mm1
! 241:
! 242: leal -12(%ebx,%eax,4), %ebx C &src[size-3]
! 243: leal -20(%edx,%eax,4), %edx C &dst[size-5]
! 244:
! 245: psllq %mm7, %mm3
! 246: subl $7, %eax C size-7
! 247:
! 248: por %mm1, %mm3 C mm3 ready to store
! 249: negl %eax C -(size-7)
! 250:
! 251: jns L(finish) C size <= 7, no passes through the loop
! 252:
! 253:
! 254: C This loop is the important bit, the rest is just support. Careful
! 255: C instruction scheduling achieves the claimed 1.75 c/l. The
! 256: C relevant parts of the pairing rules are:
! 257: C
! 258: C - mmx loads and stores execute only in the U pipe
! 259: C - only one mmx shift in a pair
! 260: C - wait one cycle before storing an mmx register result
! 261: C - the usual address generation interlock
! 262: C
! 263: C Two qword calculations are slightly interleaved. The instructions
! 264: C marked "C" belong to the second qword, and the "C prev" one is for
! 265: C the second qword from the previous iteration.
! 266:
! 267: ALIGN(8)
! 268: L(unroll_loop):
! 269: C eax counter, limbs, negative
! 270: C ebx &src[size-3], with src and size as they stood at the leal above
! 271: C ecx lshift count, tested again in the finish code
! 272: C edx &dst[size-5], with dst and size as they stood at the leal above
! 273: C esi
! 274: C edi
! 275: C
! 276: C mm0
! 277: C mm1
! 278: C mm2 src qword from -8(%ebx,%eax,4)
! 279: C mm3 dst qword ready to store to -8(%edx,%eax,4)
! 280: C
! 281: C mm5 return value
! 282: C mm6 rshift
! 283: C mm7 lshift
! 284:
! 285: movq (%ebx,%eax,4), %mm0
! 286: psrlq %mm6, %mm2
! 287:
! 288: movq %mm0, %mm1
! 289: psllq %mm7, %mm0
! 290:
! 291: movq %mm3, -8(%edx,%eax,4) C prev
! 292: por %mm2, %mm0
! 293:
! 294: movq 8(%ebx,%eax,4), %mm3 C
! 295: psrlq %mm6, %mm1 C
! 296:
! 297: movq %mm0, (%edx,%eax,4)
! 298: movq %mm3, %mm2 C
! 299:
! 300: psllq %mm7, %mm3 C
! 301: addl $4, %eax
! 302:
! 303: por %mm1, %mm3 C
! 304: js L(unroll_loop)
! 305:
! 306:
! 307: L(finish):
! 308: C eax 0 to 3 representing respectively 3 to 0 limbs remaining
! 309:
! 310: testb $2, %al
! 311:
! 312: jnz L(finish_no_two) C eax 2 or 3, less than a qword of src left
! 313:
! 314: movq (%ebx,%eax,4), %mm0
! 315: psrlq %mm6, %mm2
! 316:
! 317: movq %mm0, %mm1
! 318: psllq %mm7, %mm0
! 319:
! 320: movq %mm3, -8(%edx,%eax,4) C prev
! 321: por %mm2, %mm0
! 322:
! 323: movq %mm1, %mm2 C becomes the src prev qword
! 324: movq %mm0, %mm3 C becomes the dst qword waiting to be stored
! 325:
! 326: addl $2, %eax C two limbs handled
! 327: L(finish_no_two):
! 328:
! 329:
! 330: C eax 2 or 3 representing respectively 1 or 0 limbs remaining
! 331: C
! 332: C mm2 src prev qword, from -8(%ebx,%eax,4)
! 333: C mm3 dst qword, for -8(%edx,%eax,4)
! 334:
! 335: testb $1, %al
! 336: popl %edi
! 337:
! 338: movd %mm5, %eax C retval
! 339: jnz L(finish_zero) C flags from the testb, eax was 3
! 340:
! 341:
! 342: C One extra limb, destination was aligned.
! 343: C
! 344: C source ebx
! 345: C +-------+---------------+--
! 346: C | | mm2 |
! 347: C +-------+---------------+--
! 348: C
! 349: C dest edx
! 350: C +-------+---------------+---------------+--
! 351: C | | | mm3 |
! 352: C +-------+---------------+---------------+--
! 353: C
! 354: C mm6 = shift
! 355: C mm7 = ecx = 64-shift
! 356:
! 357:
! 358: C One extra limb, destination was unaligned.
! 359: C
! 360: C source ebx
! 361: C +-------+---------------+--
! 362: C | | mm2 |
! 363: C +-------+---------------+--
! 364: C
! 365: C dest edx
! 366: C +---------------+---------------+--
! 367: C | | mm3 |
! 368: C +---------------+---------------+--
! 369: C
! 370: C mm6 = shift+32
! 371: C mm7 = ecx = 64-(shift+32)
! 372:
! 373:
! 374: C In both cases there's one extra limb of src to fetch and combine
! 375: C with mm2 to make a qword at 8(%edx), and in the aligned case
! 376: C there's a further extra limb of dst to be formed.
! 377:
! 378:
! 379: movd 8(%ebx), %mm0 C the extra src limb, high half zeroed by the movd load
! 380: psrlq %mm6, %mm2
! 381:
! 382: movq %mm0, %mm1
! 383: psllq %mm7, %mm0
! 384:
! 385: movq %mm3, (%edx) C the qword left waiting
! 386: por %mm2, %mm0
! 387:
! 388: psrlq %mm6, %mm1
! 389: andl $32, %ecx C nonzero for ecx=64-shift (aligned), zero for 64-(shift+32), given shift 1 to 31
! 390:
! 391: popl %ebx C popl leaves the flags from the andl alone
! 392: jz L(finish_one_unaligned)
! 393:
! 394: C dst was aligned, must store one extra limb
! 395: movd %mm1, 16(%edx)
! 396: L(finish_one_unaligned):
! 397:
! 398: movq %mm0, 8(%edx)
! 399:
! 400: emms C empty the mmx state before returning
! 401:
! 402: ret
! 403:
! 404:
! 405: L(finish_zero):
! 406:
! 407: C No extra limbs, destination was aligned.
! 408: C
! 409: C source ebx
! 410: C +---------------+--
! 411: C | mm2 |
! 412: C +---------------+--
! 413: C
! 414: C dest edx+4
! 415: C +---------------+---------------+--
! 416: C | | mm3 |
! 417: C +---------------+---------------+--
! 418: C
! 419: C mm6 = shift
! 420: C mm7 = ecx = 64-shift
! 421:
! 422:
! 423: C No extra limbs, destination was unaligned.
! 424: C
! 425: C source ebx
! 426: C +---------------+--
! 427: C | mm2 |
! 428: C +---------------+--
! 429: C
! 430: C dest edx+4
! 431: C +-------+---------------+--
! 432: C | | mm3 |
! 433: C +-------+---------------+--
! 434: C
! 435: C mm6 = shift+32
! 436: C mm7 = 64-(shift+32)
! 437:
! 438:
! 439: C The movd for the unaligned case is clearly the same data as the
! 440: C movq for the aligned case, it's just a choice between whether one
! 441: C or two limbs should be written.
! 442:
! 443:
! 444: movq %mm3, 4(%edx) C the qword left waiting
! 445: psrlq %mm6, %mm2
! 446:
! 447: movd %mm2, 12(%edx) C one limb, all that is wanted when dst was unaligned
! 448: andl $32, %ecx C nonzero for ecx=64-shift (aligned), zero for 64-(shift+32), given shift 1 to 31
! 449:
! 450: popl %ebx C popl leaves the flags from the andl alone
! 451: jz L(finish_zero_unaligned)
! 452:
! 453: movq %mm2, 12(%edx) C dst was aligned, write two limbs over the one just stored
! 454: L(finish_zero_unaligned):
! 455:
! 456: emms C empty the mmx state before returning
! 457:
! 458: ret
! 459:
! 460: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>