Annotation of OpenXM_contrib/gmp/mpn/x86/k7/mmx/lshift.asm, Revision 1.1
1.1 ! maekawa 1: dnl AMD K7 mpn_lshift -- mpn left shift.
! 2: dnl
! 3: dnl K7: 1.21 cycles/limb (at 16 limbs/loop).
! 4:
! 5:
! 6: dnl Copyright (C) 1999, 2000 Free Software Foundation, Inc.
! 7: dnl
! 8: dnl This file is part of the GNU MP Library.
! 9: dnl
! 10: dnl The GNU MP Library is free software; you can redistribute it and/or
! 11: dnl modify it under the terms of the GNU Lesser General Public License as
! 12: dnl published by the Free Software Foundation; either version 2.1 of the
! 13: dnl License, or (at your option) any later version.
! 14: dnl
! 15: dnl The GNU MP Library is distributed in the hope that it will be useful,
! 16: dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
! 17: dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
! 18: dnl Lesser General Public License for more details.
! 19: dnl
! 20: dnl You should have received a copy of the GNU Lesser General Public
! 21: dnl License along with the GNU MP Library; see the file COPYING.LIB. If
! 22: dnl not, write to the Free Software Foundation, Inc., 59 Temple Place -
! 23: dnl Suite 330, Boston, MA 02111-1307, USA.
! 24:
! 25:
! 26: include(`../config.m4')
! 27:
! 28: dnl Speed measured on K7 for each unroll setting that was tried:
! 29: dnl K7: UNROLL_COUNT -> cycles/limb
! 30: dnl 4 -> 1.51
! 31: dnl 8 -> 1.26
! 32: dnl 16 -> 1.21
! 33: dnl 32 -> 1.2
! 34: dnl Maximum possible with the current code is 64.
! 35:
! 36: deflit(UNROLL_COUNT, 16) C limbs handled per pass of the unrolled loop
! 37:
! 38:
! 39: C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
! 40: C unsigned shift);
! 41: C
! 42: C Shift src,size left by shift many bits and store the result in dst,size.
! 43: C Zeros are shifted in at the right. The bits shifted out at the left are
! 44: C the return value.
! 45: C
! 46: C The comments in mpn_rshift apply here too.
! 47:
! 48: ifdef(`PIC',`
! 49: deflit(UNROLL_THRESHOLD, 10) C PIC build: sizes of this many limbs or more take the unrolled path
! 50: ',`
! 51: deflit(UNROLL_THRESHOLD, 10) C non-PIC build: same value at present
! 52: ')
! 53: C Stack layout: PARAM_ names are the four arguments in prototype order, SAVE_ names are register save slots.
! 54: defframe(PARAM_SHIFT,16) C unsigned shift
! 55: defframe(PARAM_SIZE, 12) C mp_size_t size (also reused as scratch by the unrolled path)
! 56: defframe(PARAM_SRC, 8) C mp_srcptr src
! 57: defframe(PARAM_DST, 4) C mp_ptr dst
! 58:
! 59: defframe(SAVE_EDI, -4) C slot for the caller value of edi
! 60: defframe(SAVE_ESI, -8) C slot for the caller value of esi
! 61: defframe(SAVE_EBX, -12) C slot for the caller value of ebx
! 62: deflit(SAVE_SIZE, 12) C bytes subtracted from esp to hold the three save slots
! 63:
! 64: .text
! 65: ALIGN(32)
! 66: C Entry: load the arguments and finish the size==1 case with plain integer shifts.
! 67: PROLOGUE(mpn_lshift)
! 68: deflit(`FRAME',0)
! 69:
! 70: movl PARAM_SIZE, %eax C eax = size
! 71: movl PARAM_SRC, %edx C edx = src
! 72: subl $SAVE_SIZE, %esp C reserve the register save slots
! 73: deflit(`FRAME',SAVE_SIZE)
! 74: C NOTE(review): FRAME presumably tells the defframe names how far esp has moved - confirm in the m4 defs.
! 75: movl PARAM_SHIFT, %ecx C ecx = shift count
! 76: movl %edi, SAVE_EDI C edi is reloaded from this slot on every exit path
! 77:
! 78: movl PARAM_DST, %edi C edi = dst
! 79: decl %eax C eax = size-1, zero when there is a single limb
! 80: jnz L(more_than_one_limb)
! 81:
! 82: movl (%edx), %edx C edx = the only src limb
! 83:
! 84: shldl( %cl, %edx, %eax) C eax was decremented to zero, so it ends up holding only the bits shifted out of edx (return value) - assuming shldl expands to a plain shld
! 85:
! 86: shll %cl, %edx C the shifted limb
! 87:
! 88: movl %edx, (%edi) C store dst[0]
! 89: movl SAVE_EDI, %edi
! 90: addl $SAVE_SIZE, %esp
! 91:
! 92: ret C no emms here, this path never touched an MMX register
! 93:
! 94:
! 95: C -----------------------------------------------------------------------------
! 96: L(more_than_one_limb):
! 97: C eax size-1
! 98: C ebx
! 99: C ecx shift
! 100: C edx src
! 101: C esi
! 102: C edi dst
! 103: C ebp
! 104: C Sizes below UNROLL_THRESHOLD use the simple one-limb-per-pass loop here, larger sizes go to the unrolled code.
! 105: movd PARAM_SHIFT, %mm6 C mm6 = shift
! 106: movd (%edx,%eax,4), %mm5 C src high limb
! 107: cmp $UNROLL_THRESHOLD-1, %eax C compare size-1 with threshold-1
! 108:
! 109: jae L(unroll) C taken when size >= UNROLL_THRESHOLD, ecx is still the plain shift
! 110: negl %ecx
! 111: movd (%edx), %mm4 C src low limb
! 112:
! 113: addl $32, %ecx C ecx = 32-shift
! 114:
! 115: movd %ecx, %mm7 C mm7 = 32-shift
! 116:
! 117: L(simple_top):
! 118: C eax loop counter, limbs
! 119: C ebx
! 120: C ecx
! 121: C edx src
! 122: C esi
! 123: C edi dst
! 124: C ebp
! 125: C
! 126: C mm0 scratch
! 127: C mm4 src low limb
! 128: C mm5 src high limb
! 129: C mm6 shift
! 130: C mm7 32-shift
! 131: C Each pass forms dst limb i from src limbs i-1 and i, where i is eax on entry to the pass, working from the high end down.
! 132: movq -4(%edx,%eax,4), %mm0 C src limb i-1 in the low half, src limb i in the high half
! 133: decl %eax C sets the flags tested by the jnz below
! 134:
! 135: psrlq %mm7, %mm0 C low dword is now dst limb i
! 136:
! 137: movd %mm0, 4(%edi,%eax,4) C store dst limb i (eax is already i-1)
! 138: jnz L(simple_top) C MMX instructions leave the flags from decl intact
! 139: C Loop finished: dst limbs size-1 down to 1 are written, the low limb and the return value remain.
! 140:
! 141: psllq %mm6, %mm5
! 142: psllq %mm6, %mm4
! 143:
! 144: psrlq $32, %mm5 C bits shifted out of the high limb
! 145: movd %mm4, (%edi) C dst low limb
! 146:
! 147: movd %mm5, %eax C return value
! 148:
! 149: movl SAVE_EDI, %edi
! 150: addl $SAVE_SIZE, %esp
! 151: emms C clear MMX state before returning to the caller
! 152:
! 153: ret
! 154:
! 155:
! 156: C -----------------------------------------------------------------------------
! 157: ALIGN(16)
! 158: L(unroll):
! 159: C eax size-1
! 160: C ebx (saved)
! 161: C ecx shift
! 162: C edx src
! 163: C esi
! 164: C edi dst
! 165: C ebp
! 166: C
! 167: C mm5 src high limb, for return value
! 168: C mm6 lshift
! 169: C Setup for the unrolled loop: save esi and ebx, align src and dst to 8 bytes, then compute the loop entry point.
! 170: movl %esi, SAVE_ESI
! 171: movl %ebx, SAVE_EBX
! 172: leal -4(%edx,%eax,4), %edx C &src[size-2]
! 173:
! 174: testb $4, %dl C is that address at 4 mod 8
! 175: movq (%edx), %mm1 C src high qword
! 176:
! 177: jz L(start_src_aligned) C already a multiple of 8
! 178:
! 179:
! 180: C src isn't aligned, process high limb (marked xxx) separately to
! 181: C make it so
! 182: C
! 183: C source -4(edx,%eax,4)
! 184: C |
! 185: C +-------+-------+-------+--
! 186: C | xxx |
! 187: C +-------+-------+-------+--
! 188: C 0mod8 4mod8 0mod8
! 189: C
! 190: C dest -4(edi,%eax,4)
! 191: C |
! 192: C +-------+-------+--
! 193: C | xxx | |
! 194: C +-------+-------+--
! 195:
! 196: psllq %mm6, %mm1 C high dword becomes the top dst limb
! 197: subl $4, %edx C step src down one limb, now 8-byte aligned
! 198: movl %eax, PARAM_SIZE C size-1 (read back before the computed jump for the odd/even test at the end)
! 199:
! 200: psrlq $32, %mm1 C bring that dword down to the low half
! 201: decl %eax C size-2 is new size-1
! 202:
! 203: movd %mm1, 4(%edi,%eax,4) C store the top dst limb
! 204: movq (%edx), %mm1 C new src high qword
! 205: L(start_src_aligned):
! 206:
! 207:
! 208: leal -4(%edi,%eax,4), %edi C &dst[size-2]
! 209: psllq %mm6, %mm5
! 210:
! 211: testl $4, %edi C is that address at 4 mod 8
! 212: psrlq $32, %mm5 C return value
! 213:
! 214: jz L(start_dst_aligned) C flags still from the testl
! 215:
! 216:
! 217: C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
! 218: C shift is 32 bits extra. High limb of dst (marked xxx) handled
! 219: C here separately.
! 220: C
! 221: C source %edx
! 222: C +-------+-------+--
! 223: C | mm1 |
! 224: C +-------+-------+--
! 225: C 0mod8 4mod8
! 226: C
! 227: C dest %edi
! 228: C +-------+-------+-------+--
! 229: C | xxx |
! 230: C +-------+-------+-------+--
! 231: C 0mod8 4mod8 0mod8
! 232:
! 233: movq %mm1, %mm0 C keep an unshifted copy of the src high qword
! 234: psllq %mm6, %mm1
! 235: addl $32, %ecx C shift+32
! 236:
! 237: psrlq $32, %mm1 C low dword is now the high dst limb
! 238:
! 239: movd %mm1, 4(%edi) C store the high dst limb
! 240: movq %mm0, %mm1 C restore the unshifted src high qword
! 241: subl $4, %edi C dst now 8-byte aligned
! 242:
! 243: movd %ecx, %mm6 C new lshift
! 244: L(start_dst_aligned):
! 245:
! 246: decl %eax C size-2, two last limbs handled at end
! 247: movq %mm1, %mm2 C copy of src high qword
! 248: negl %ecx
! 249:
! 250: andl $-2, %eax C round size down to even
! 251: addl $64, %ecx C ecx = 64-lshift
! 252:
! 253: movl %eax, %ebx C ebx = even limb count for the loop
! 254: negl %eax
! 255:
! 256: andl $UNROLL_MASK, %eax C limbs of code to skip on entry - presumably UNROLL_MASK is UNROLL_COUNT-1, it is defined outside this file
! 257: decl %ebx
! 258:
! 259: shll %eax C doubled, so scaling by 5 below gives 10 code bytes per skipped limb
! 260:
! 261: movd %ecx, %mm7 C rshift = 64-lshift
! 262:
! 263: ifdef(`PIC',`
! 264: call L(pic_calc)
! 265: L(here):
! 266: ',`
! 267: leal L(entry) (%eax,%eax,4), %esi C esi = entry + 5*eax
! 268: ')
! 269: shrl $UNROLL_LOG2, %ebx C loop counter
! 270:
! 271: leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx C bias src by 4 bytes per skipped limb to match the entry point, plus 128 when UNROLL_BYTES is 256
! 272: leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi C same bias for dst
! 273: movl PARAM_SIZE, %eax C for use at end
! 274: jmp *%esi C enter the unrolled code part way through
! 275:
! 276:
! 277: ifdef(`PIC',`
! 278: L(pic_calc):
! 279: C See README.family about old gas bugs
! 280: leal (%eax,%eax,4), %esi C esi = 5*eax
! 281: addl $L(entry)-L(here), %esi C plus the distance from the return point to the loop entry label
! 282: addl (%esp), %esi C plus the return address on the stack - the call is immediately followed by the here label
! 283: C esi now holds the same jump target the non-PIC leal computes - without naming an absolute address.
! 284: ret
! 285: ')
! 286:
! 287:
! 288: C -----------------------------------------------------------------------------
! 289: ALIGN(32)
! 290: L(top):
! 291: C eax size (for use at end)
! 292: C ebx loop counter
! 293: C ecx rshift
! 294: C edx src
! 295: C esi computed jump
! 296: C edi dst
! 297: C ebp
! 298: C
! 299: C mm0 scratch
! 300: C mm1 \ carry (alternating, mm2 first)
! 301: C mm2 /
! 302: C mm6 lshift
! 303: C mm7 rshift
! 304: C
! 305: C 10 code bytes/limb
! 306: C
! 307: C The two chunks differ in whether mm1 or mm2 hold the carry.
! 308: C The computed jump puts the initial carry in both mm1 and mm2.
! 309: C The entry label sits at the same address as the top label since no instruction is emitted between them.
! 310: L(entry):
! 311: deflit(CHUNK_COUNT, 4)
! 312: forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
! 313: deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
! 314: deflit(`disp1', eval(disp0 - 8))
! 315:
! 316: movq disp0(%edx), %mm0 C next lower src qword
! 317: psllq %mm6, %mm2 C carry from the qword above
! 318:
! 319: movq %mm0, %mm1 C keep a copy to act as the next carry
! 320: psrlq %mm7, %mm0
! 321:
! 322: por %mm2, %mm0 C combine into one dst qword
! 323: movq %mm0, disp0(%edi)
! 324:
! 325: C Second half of the chunk: same steps with mm1 and mm2 swapped
! 326: movq disp1(%edx), %mm0
! 327: psllq %mm6, %mm1
! 328:
! 329: movq %mm0, %mm2
! 330: psrlq %mm7, %mm0
! 331:
! 332: por %mm1, %mm0
! 333: movq %mm0, disp1(%edi)
! 334: ')
! 335:
! 336: subl $UNROLL_BYTES, %edx C step src down by one full pass
! 337: subl $UNROLL_BYTES, %edi C step dst down by one full pass
! 338: decl %ebx
! 339:
! 340: jns L(top) C repeat until the counter goes negative
! 341:
! 342:
! 343:
! 344: define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')
! 345: C disp(n) yields displacement n, less 128 when UNROLL_BYTES is 256 - presumably emitting nothing when the result is zero.
! 346: L(end):
! 347: testb $1, %al C odd or even, using the size value reloaded from PARAM_SIZE before the computed jump
! 348: movl SAVE_EBX, %ebx
! 349: psllq %mm6, %mm2 C wanted left shifted in all cases below
! 350:
! 351: movd %mm5, %eax C return value
! 352:
! 353: movl SAVE_ESI, %esi
! 354: jz L(end_even) C flags still from the testb, the moves and MMX shift do not change them
! 355:
! 356:
! 357: L(end_odd):
! 358:
! 359: C Size odd, destination was aligned.
! 360: C
! 361: C source edx+8 edx+4
! 362: C --+---------------+-------+
! 363: C | mm2 | |
! 364: C --+---------------+-------+
! 365: C
! 366: C dest edi
! 367: C --+---------------+---------------+-------+
! 368: C | written | | |
! 369: C --+---------------+---------------+-------+
! 370: C
! 371: C mm6 = shift
! 372: C mm7 = ecx = 64-shift
! 373:
! 374:
! 375: C Size odd, destination was unaligned.
! 376: C
! 377: C source edx+8 edx+4
! 378: C --+---------------+-------+
! 379: C | mm2 | |
! 380: C --+---------------+-------+
! 381: C
! 382: C dest edi
! 383: C --+---------------+---------------+
! 384: C | written | |
! 385: C --+---------------+---------------+
! 386: C
! 387: C mm6 = shift+32
! 388: C mm7 = ecx = 64-(shift+32)
! 389:
! 390:
! 391: C In both cases there's one extra limb of src to fetch and combine
! 392: C with mm2 to make a qword at (%edi), and in the aligned case
! 393: C there's an extra limb of dst to be formed from that extra src limb
! 394: C left shifted.
! 395:
! 396: movd disp(4) (%edx), %mm0 C the extra src limb
! 397: testb $32, %cl C bit 5 of ecx tells the two cases apart: set for 64-shift, clear for 64-(shift+32), assuming shift is 1 to 31
! 398:
! 399: movq %mm0, %mm1 C copy used for the extra dst limb
! 400: psllq $32, %mm0 C move the limb up to the high half
! 401:
! 402: psrlq %mm7, %mm0
! 403: psllq %mm6, %mm1
! 404:
! 405: por %mm2, %mm0 C merge with the left shifted carry in mm2
! 406:
! 407: movq %mm0, disp(0) (%edi)
! 408: jz L(end_odd_unaligned) C flags still from the testb
! 409: movd %mm1, disp(-4) (%edi) C aligned case only: the extra lowest dst limb
! 410: L(end_odd_unaligned):
! 411:
! 412: movl SAVE_EDI, %edi
! 413: addl $SAVE_SIZE, %esp
! 414: emms C clear MMX state before returning to the caller
! 415:
! 416: ret
! 417:
! 418:
! 419: L(end_even):
! 420:
! 421: C Size even, destination was aligned.
! 422: C
! 423: C source edx+8
! 424: C --+---------------+
! 425: C | mm2 |
! 426: C --+---------------+
! 427: C
! 428: C dest edi
! 429: C --+---------------+---------------+
! 430: C | written | |
! 431: C --+---------------+---------------+
! 432: C
! 433: C mm6 = shift
! 434: C mm7 = ecx = 64-shift
! 435:
! 436:
! 437: C Size even, destination was unaligned.
! 438: C
! 439: C source edx+8
! 440: C --+---------------+
! 441: C | mm2 |
! 442: C --+---------------+
! 443: C
! 444: C dest edi+4
! 445: C --+---------------+-------+
! 446: C | written | |
! 447: C --+---------------+-------+
! 448: C
! 449: C mm6 = shift+32
! 450: C mm7 = ecx = 64-(shift+32)
! 451:
! 452:
! 453: C The movq for the aligned case overwrites the movd for the
! 454: C unaligned case.
! 455:
! 456: movq %mm2, %mm0 C keep the whole shifted qword for the aligned case
! 457: psrlq $32, %mm2 C high dword down to the low half
! 458:
! 459: testb $32, %cl C bit 5 of ecx tells the two cases apart: set for 64-shift, clear for 64-(shift+32), assuming shift is 1 to 31
! 460: movd %mm2, disp(4) (%edi) C single limb store, all that the unaligned case needs
! 461:
! 462: jz L(end_even_unaligned) C flags still from the testb
! 463: movq %mm0, disp(0) (%edi) C aligned case: full qword store covering the limb just written
! 464: L(end_even_unaligned):
! 465:
! 466: movl SAVE_EDI, %edi
! 467: addl $SAVE_SIZE, %esp
! 468: emms C clear MMX state before returning to the caller
! 469:
! 470: ret
! 471:
! 472: EPILOGUE()
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>