version 1.1, 2000/09/09 14:12:38 |
version 1.1.1.2, 2003/08/25 16:06:24 |
|
|
dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add |
dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add |
dnl the result to a second limb vector. |
dnl the result to a second limb vector. |
|
|
dnl Copyright (C) 1995, 1997, 1998, 2000 Free Software Foundation, Inc. |
dnl Copyright 1995, 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc. |
|
|
dnl This file is part of the GNU MP Library. |
dnl This file is part of the GNU MP Library. |
|
|
|
|
dnl size r5 |
dnl size r5 |
dnl s2_limb r6 |
dnl s2_limb r6 |
|
|
dnl This is optimized for the PPC604. It has not been tested on PPC601, PPC603 |
dnl This is optimized for the PPC604. It has not been tuned for PPC601, |
dnl or PPC750 since I don't have access to any such machines. |
dnl PPC603, PPC750 (G3), 7400 (G4), 7450 (newer G4). |
|
dnl |
|
dnl Loop Analysis for the 604: |
|
dnl 12 mem insn |
|
dnl 8 serializing insn |
|
dnl 8 int multiply |
|
dnl 25 int reg write |
|
dnl 9 int ops (8 of which serialize) |
|
dnl |
|
dnl The multiply insns need 16 cycles/4limb. |
|
dnl The integer register writes will need 13 cycles/4limb. |
|
dnl All-in-all, it should be possible to get to 4 cycles/limb, |
|
dnl but that will require some clever FPNOPS and BNOPS for exact |
|
dnl issue control. |
|
|
include(`../config.m4') |
include(`../config.m4') |
|
|
ASM_START() |
ASM_START() |
PROLOGUE(mpn_addmul_1) |
PROLOGUE(mpn_addmul_1) |
cmpi cr0,r5,9 C more than 9 limbs? |
cmpi cr0,r5,9 C more than 9 limbs? |
bgt cr0,.Lbig C branch if more than 9 limbs |
bgt cr0,L(big) C branch if more than 9 limbs |
|
|
mtctr r5 |
mtctr r5 |
lwz r0,0(r4) |
lwz r0,0(r4) |
Line 44 PROLOGUE(mpn_addmul_1) |
|
Line 57 PROLOGUE(mpn_addmul_1) |
|
lwz r9,0(r3) |
lwz r9,0(r3) |
addc r8,r7,r9 |
addc r8,r7,r9 |
addi r3,r3,-4 |
addi r3,r3,-4 |
bdz .Lend |
bdz L(end) |
.Lloop: |
L(loop): |
lwzu r0,4(r4) |
lwzu r0,4(r4) |
stwu r8,4(r3) |
stwu r8,4(r3) |
mullw r8,r0,r6 |
mullw r8,r0,r6 |
Line 54 PROLOGUE(mpn_addmul_1) |
|
Line 67 PROLOGUE(mpn_addmul_1) |
|
lwz r9,4(r3) |
lwz r9,4(r3) |
addze r10,r10 |
addze r10,r10 |
addc r8,r7,r9 |
addc r8,r7,r9 |
bdnz .Lloop |
bdnz L(loop) |
.Lend: stw r8,4(r3) |
L(end): stw r8,4(r3) |
addze r3,r10 |
addze r3,r10 |
blr |
blr |
|
|
.Lbig: stmw r30,-32(r1) |
L(big): stmw r30,-32(r1) |
addi r5,r5,-1 |
addi r5,r5,-1 |
srwi r0,r5,2 |
srwi r0,r5,2 |
mtctr r0 |
mtctr r0 |
Line 71 PROLOGUE(mpn_addmul_1) |
|
Line 84 PROLOGUE(mpn_addmul_1) |
|
addc r8,r8,r7 |
addc r8,r8,r7 |
stw r8,0(r3) |
stw r8,0(r3) |
|
|
.LloopU: |
L(loopU): |
lwz r7,4(r4) |
lwz r7,4(r4) |
lwz r12,8(r4) |
lwz r12,8(r4) |
lwz r30,12(r4) |
lwz r30,12(r4) |
Line 101 PROLOGUE(mpn_addmul_1) |
|
Line 114 PROLOGUE(mpn_addmul_1) |
|
stw r10,12(r3) |
stw r10,12(r3) |
adde r11,r11,r31 |
adde r11,r11,r31 |
stwu r11,16(r3) |
stwu r11,16(r3) |
bdnz .LloopU |
bdnz L(loopU) |
|
|
andi. r31,r5,3 |
andi. r31,r5,3 |
mtctr r31 |
mtctr r31 |
beq cr0,.Lendx |
beq cr0,L(endx) |
|
|
.LloopE: |
L(loopE): |
lwzu r7,4(r4) |
lwzu r7,4(r4) |
mullw r8,r7,r6 |
mullw r8,r7,r6 |
adde r8,r8,r0 C add cy_limb |
adde r8,r8,r0 C add cy_limb |
Line 116 PROLOGUE(mpn_addmul_1) |
|
Line 129 PROLOGUE(mpn_addmul_1) |
|
addze r0,r0 C new cy_limb |
addze r0,r0 C new cy_limb |
addc r8,r8,r7 |
addc r8,r8,r7 |
stwu r8,4(r3) |
stwu r8,4(r3) |
bdnz .LloopE |
bdnz L(loopE) |
.Lendx: |
L(endx): |
addze r3,r0 |
addze r3,r0 |
lmw r30,-32(r1) |
lmw r30,-32(r1) |
blr |
blr |