Annotation of OpenXM_contrib/gmp/mpn/powerpc32/addmul_1.asm, Revision 1.1.1.2
1.1 maekawa 1: dnl PowerPC-32 mpn_addmul_1 -- Multiply a limb vector with a limb and add
2: dnl the result to a second limb vector.
3:
1.1.1.2 ! ohara 4: dnl Copyright 1995, 1997, 1998, 2000, 2001, 2002 Free Software Foundation, Inc.
1.1 maekawa 5:
6: dnl This file is part of the GNU MP Library.
7:
8: dnl The GNU MP Library is free software; you can redistribute it and/or modify
9: dnl it under the terms of the GNU Lesser General Public License as published by
10: dnl the Free Software Foundation; either version 2.1 of the License, or (at your
11: dnl option) any later version.
12:
13: dnl The GNU MP Library is distributed in the hope that it will be useful, but
14: dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15: dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16: dnl License for more details.
17:
18: dnl You should have received a copy of the GNU Lesser General Public License
19: dnl along with the GNU MP Library; see the file COPYING.LIB. If not, write to
20: dnl the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
21: dnl MA 02111-1307, USA.
22:
23:
24: dnl INPUT PARAMETERS
25: dnl res_ptr r3
26: dnl s1_ptr r4
27: dnl size r5
28: dnl s2_limb r6
29:
1.1.1.2 ! ohara 30: dnl This is optimized for the PPC604. It has not been tuned for PPC601,
! 31: dnl PPC603, PPC750 (G3), 7400 (G4), 7450 (newer G4).
! 32: dnl
! 33: dnl Loop Analysis for the 604:
! 34: dnl 12 mem insn
! 35: dnl 8 serializing insn
! 36: dnl 8 int multiply
! 37: dnl 25 int reg write
! 38: dnl 9 int ops (8 of which serialize)
! 39: dnl
! 40: dnl The multiply insns need 16 cycles/4limb.
! 41: dnl The integer register writes will need 13 cycles/4limb.
! 42: dnl All-in-all, it should be possible to get to 4 cycles/limb,
! 43: dnl but that will require some clever FPNOPS and BNOPS for exact
! 44: dnl issue control.
1.1 maekawa 45:
46: include(`../config.m4')
47:
48: ASM_START()
C mpn_addmul_1: multiply the 32-bit limb vector at s1_ptr (r4) by s2_limb (r6)
C and add the products into the limb vector at res_ptr (r3); size (r5) is the
C limb count.  The final carry limb is left in r3 before each blr.
C
C Sizes up to 9 run the one-limb-per-pass loop L(loop).  Larger sizes branch
C to L(big), which handles the first limb, then four limbs per pass in
C L(loopU), then the remaining 0 to 3 limbs in L(loopE).
C
C The carry bit (CA) is kept live between the addc/adde/addze insns and across
C the loop branches, so the relative order of those insns must not change.
C
C NOTE(review): both paths process the first limb unconditionally, so size is
C assumed to be at least 1 -- confirm against callers.
49: PROLOGUE(mpn_addmul_1)
50: cmpi cr0,r5,9 C more than 9 limbs?
1.1.1.2 ! ohara 51: bgt cr0,L(big) C branch if more than 9 limbs
1.1 maekawa 52: C ---- small sizes: first limb, then one limb per loop pass ----
53: mtctr r5 C ctr = size
54: lwz r0,0(r4) C r0 = s1[0]
55: mullw r7,r0,r6 C r7 = low word of s1[0] * s2_limb
56: mulhwu r10,r0,r6 C r10 = high word, the running carry limb
57: lwz r9,0(r3) C r9 = res[0]
58: addc r8,r7,r9 C r8 = low word + res[0], carry out in CA
59: addi r3,r3,-4 C bias res_ptr so the 4(r3) stores below hit res[0] first
1.1.1.2 ! ohara 60: bdz L(end) C ctr = size-1; zero means only one limb, skip the loop
! 61: L(loop):
1.1 maekawa 62: lwzu r0,4(r4) C r0 = next s1 limb, s1_ptr advances by 4
63: stwu r8,4(r3) C store the pending result limb, res_ptr advances by 4
64: mullw r8,r0,r6 C r8 = low word of the new product
65: adde r7,r8,r10 C r7 = low word + carry limb + CA
66: mulhwu r10,r0,r6 C r10 = high word, the new carry limb
67: lwz r9,4(r3) C r9 = res limb that this product is added to
68: addze r10,r10 C fold CA from the adde into the carry limb
69: addc r8,r7,r9 C r8 = new result limb, carry out stays in CA
1.1.1.2 ! ohara 70: bdnz L(loop)
! 71: L(end): stw r8,4(r3) C store the last result limb
1.1 maekawa 72: addze r3,r10 C r3 = carry limb + CA
73: blr
74: C ---- more than 9 limbs: four limbs per pass, using r30 and r31 as extra temps ----
1.1.1.2 ! ohara 75: L(big): stmw r30,-32(r1) C save r30,r31 at -32(r1); r1 is not moved. NOTE(review): relies on memory below r1 being preserved -- confirm for the target ABI
1.1 maekawa 76: addi r5,r5,-1 C r5 = size-1, the limbs left after the first one
77: srwi r0,r5,2 C r0 = (size-1)/4
78: mtctr r0 C ctr = number of 4-limb passes
79: C first limb is handled alone; it starts the carry chain
80: lwz r7,0(r4) C r7 = s1[0]
81: mullw r8,r7,r6 C r8 = low word of s1[0] * s2_limb
82: mulhwu r0,r7,r6 C r0 = high word, the running carry limb
83: lwz r7,0(r3) C r7 = res[0]
84: addc r8,r8,r7 C r8 = low word + res[0], carry out in CA
85: stw r8,0(r3) C res[0] done; pointers are not advanced here
86: C each pass handles the limbs at offsets 4,8,12,16 from both pointers
1.1.1.2 ! ohara 87: L(loopU):
1.1 maekawa 88: lwz r7,4(r4) C load four s1 limbs ...
89: lwz r12,8(r4)
90: lwz r30,12(r4)
91: lwzu r31,16(r4) C ... and advance s1_ptr by 16
92: mullw r8,r7,r6 C low words of the four products
93: mullw r9,r12,r6
94: mullw r10,r30,r6
95: mullw r11,r31,r6
96: adde r8,r8,r0 C add cy_limb (plus CA left by the previous addc/adde)
97: mulhwu r0,r7,r6 C r0 = high word of product 1
98: lwz r7,4(r3) C r7 is free again: reuse it for the res limb
99: adde r9,r9,r0 C low word 2 + high word 1 + CA
100: mulhwu r0,r12,r6 C r0 = high word of product 2
101: lwz r12,8(r3) C res limb 2
102: adde r10,r10,r0 C low word 3 + high word 2 + CA
103: mulhwu r0,r30,r6 C r0 = high word of product 3
104: lwz r30,12(r3) C res limb 3
105: adde r11,r11,r0 C low word 4 + high word 3 + CA
106: mulhwu r0,r31,r6 C r0 = high word of product 4
107: lwz r31,16(r3) C res limb 4
108: addze r0,r0 C new cy_limb (high word 4 + CA)
109: addc r8,r8,r7 C second carry chain: add the four res limbs
110: stw r8,4(r3)
111: adde r9,r9,r12
112: stw r9,8(r3)
113: adde r10,r10,r30
114: stw r10,12(r3)
115: adde r11,r11,r31 C its carry out stays in CA for the next pass or L(loopE)
116: stwu r11,16(r3) C store limb 4 and advance res_ptr by 16
1.1.1.2 ! ohara 117: bdnz L(loopU)
1.1 maekawa 118: C leftover limbs after the 4-limb passes
119: andi. r31,r5,3 C r31 = (size-1) mod 4; also sets cr0 for the beq below
120: mtctr r31 C ctr = number of leftover limbs
1.1.1.2 ! ohara 121: beq cr0,L(endx) C none left
1.1 maekawa 122: C one limb per pass, same carry handling as above
1.1.1.2 ! ohara 123: L(loopE):
1.1 maekawa 124: lwzu r7,4(r4) C r7 = next s1 limb, s1_ptr advances by 4
125: mullw r8,r7,r6 C r8 = low word of the product
126: adde r8,r8,r0 C add cy_limb
127: mulhwu r0,r7,r6 C r0 = high word of the product
128: lwz r7,4(r3) C r7 = res limb to be updated
129: addze r0,r0 C new cy_limb
130: addc r8,r8,r7 C add the res limb, carry out stays in CA
131: stwu r8,4(r3) C store the result limb and advance res_ptr by 4
1.1.1.2 ! ohara 132: bdnz L(loopE)
! 133: L(endx):
1.1 maekawa 134: addze r3,r0 C r3 = carry limb + CA
135: lmw r30,-32(r1) C restore r30,r31 from where stmw saved them
136: blr
137: EPILOGUE(mpn_addmul_1)
FreeBSD-CVSweb <freebsd-cvsweb@FreeBSD.org>