=================================================================== RCS file: /home/cvs/OpenXM_contrib/gmp/mpn/powerpc32/Attic/addmul_1.asm,v retrieving revision 1.1.1.1 retrieving revision 1.1.1.2 diff -u -p -r1.1.1.1 -r1.1.1.2 --- OpenXM_contrib/gmp/mpn/powerpc32/Attic/addmul_1.asm 2000/09/09 14:12:38 1.1.1.1 +++ OpenXM_contrib/gmp/mpn/powerpc32/Attic/addmul_1.asm 2003/08/25 16:06:24 1.1.1.2 @@ -1,7 +1,7 @@ dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add dnl the result to a second limb vector. -dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc. +dnl Copyright 1995, 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc. dnl This file is part of the GNU MP Library. @@ -27,15 +27,28 @@ dnl s1_ptr r4 dnl size r5 dnl s2_limb r6 -dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603 -dnl or PPC750 since I don't have access to any such machines. +dnl This is optimized for the PPC604. It has not been tuned for PPC601, +dnl PPC603, PPC750 (G3), 7400 (G4), 7450 (newer G4). +dnl +dnl Loop Analysis for the 604: +dnl 12 mem insn +dnl 8 serializing insn +dnl 8 int multiply +dnl 25 int reg write +dnl 9 int ops (8 of which serialize) +dnl +dnl The multiply insns need 16 cycles/4limb. +dnl The integer register writes will need 13 cycles/4limb. +dnl All-in-all, it should be possible to get to 4 cycles/limb, +dnl but that will require some clever FPNOPS and BNOPS for exact +dnl issue control. include(`../config.m4') ASM_START() PROLOGUE(mpn_addmul_1) cmpi cr0,r5,9 C more than 9 limbs? - bgt cr0,.Lbig C branch if more than 9 limbs + bgt cr0,L(big) C branch if more than 9 limbs mtctr r5 lwz r0,0(r4) @@ -44,8 +57,8 @@ PROLOGUE(mpn_addmul_1) lwz r9,0(r3) addc r8,r7,r9 addi r3,r3,-4 - bdz .Lend -.Lloop: + bdz L(end) +L(loop): lwzu r0,4(r4) stwu r8,4(r3) mullw r8,r0,r6 @@ -54,12 +67,12 @@ PROLOGUE(mpn_addmul_1) lwz r9,4(r3) addze r10,r10 addc r8,r7,r9 - bdnz .Lloop -.Lend: stw r8,4(r3) + bdnz L(loop) +L(end): stw r8,4(r3) addze r3,r10 blr -.Lbig: stmw r30,-32(r1) +L(big): stmw r30,-32(r1) addi r5,r5,-1 srwi r0,r5,2 mtctr r0 @@ -71,7 +84,7 @@ PROLOGUE(mpn_addmul_1) addc r8,r8,r7 stw r8,0(r3) -.LloopU: +L(loopU): lwz r7,4(r4) lwz r12,8(r4) lwz r30,12(r4) @@ -101,13 +114,13 @@ PROLOGUE(mpn_addmul_1) stw r10,12(r3) adde r11,r11,r31 stwu r11,16(r3) - bdnz .LloopU + bdnz L(loopU) andi. r31,r5,3 mtctr r31 - beq cr0,.Lendx + beq cr0,L(endx) -.LloopE: +L(loopE): lwzu r7,4(r4) mullw r8,r7,r6 adde r8,r8,r0 C add cy_limb @@ -116,8 +129,8 @@ PROLOGUE(mpn_addmul_1) addze r0,r0 C new cy_limb addc r8,r8,r7 stwu r8,4(r3) - bdnz .LloopE -.Lendx: + bdnz L(loopE) +L(endx): addze r3,r0 lmw r30,-32(r1) blr