% $OpenXM: OpenXM/doc/ascm2001p/homogeneous-network.tex,v 1.5 2001/06/20 03:08:05 takayama Exp $
|
|
\subsection{Distributed computation with homogeneous servers}
\label{section:homog}
|
|
One of the aims of OpenXM is parallel speedup by distributed computation
with homogeneous servers. As the current specification of OpenXM does
not include communication between servers, one cannot expect
the maximal parallel speedup. However, it is possible to execute
several types of distributed computation as follows.
|
|
\subsubsection{Competitive distributed computation by various strategies} |
|
|
SINGULAR \cite{Singular} implements the {\it MP} interface for distributed
computation, and a competitive Gr\"obner basis computation is
illustrated as an example of distributed computation.
Such a distributed computation is also possible on OpenXM as follows.
\begin{verbatim}
extern Proc1,Proc2$ Proc1 = -1$ Proc2 = -1$
/* G:set of polys; V:list of variables */
/* Mod: the ground field GF(Mod); O:type of order */
def dgr(G,V,Mod,O)
{
  /* invoke servers if necessary */
  if ( Proc1 == -1 ) Proc1 = ox_launch();
  if ( Proc2 == -1 ) Proc2 = ox_launch();
  P = [Proc1,Proc2];
  map(ox_reset,P);        /* reset servers */
  /* P[0] executes the Buchberger algorithm over GF(Mod) */
  ox_cmo_rpc(P[0],"dp_gr_mod_main",G,V,0,Mod,O);
  /* P[1] executes the F4 algorithm over GF(Mod) */
  ox_cmo_rpc(P[1],"dp_f4_mod_main",G,V,Mod,O);
  map(ox_push_cmd,P,262); /* 262 = OX_popCMO */
  F = ox_select(P);       /* wait for data */
  /* F[0] is the id of a server which is ready */
  R = ox_get(F[0]);
  if ( F[0] == P[0] ) { Win = "Buchberger"; Lose = P[1]; }
  else { Win = "F4"; Lose = P[0]; }
  ox_reset(Lose);         /* reset the loser */
  return [Win,R];
}
\end{verbatim}
In the above Asir program, the client invokes two servers if they are not
yet running and requests Gr\"obner basis computations of the same input
from them, one by the Buchberger algorithm and the other by the $F_4$
algorithm.
The client watches the streams by {\tt ox\_select()}
and takes the result which is returned first. Then the remaining
server is reset.
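For example, assuming that the file defining {\tt dgr()} has been loaded into
the client, a competitive computation may be requested as follows (the ideal,
the prime $31991$ and the order type $0$ are chosen only for illustration):
\begin{verbatim}
dgr([x^3-y^2, x*y-1, x^2+y^2-2*x], [x,y], 31991, 0);
\end{verbatim}
The returned list consists of the name of the algorithm that finished first
and the Gr\"obner basis computed by the corresponding server.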
|
|
|
\subsubsection{Nesting of client-server communication}
|
|
Under OpenXM-RFC 100, an OpenXM server can be a client of other servers.
Figure \ref{tree} illustrates a tree-like structure of OpenXM
client-server communication.
|
|
\begin{figure}
\begin{center}
\begin{picture}(200,90)(0,0)
\put(70,70){\framebox(40,15){client}}
\put(20,30){\framebox(40,15){server}}
\put(70,30){\framebox(40,15){server}}
\put(120,30){\framebox(40,15){server}}
\put(0,0){\framebox(40,15){server}}
\put(50,0){\framebox(40,15){server}}
\put(150,0){\framebox(40,15){server}}

\put(90,70){\vector(-2,-1){43}}
\put(90,70){\vector(0,-1){21}}
\put(90,70){\vector(2,-1){43}}
\put(40,30){\vector(-2,-1){22}}
\put(40,30){\vector(2,-1){22}}
\put(140,30){\vector(2,-1){22}}
\end{picture}
\caption{Tree-like structure of client-server communication}
\label{tree}
\end{center}
\end{figure}
|
|
Such a computational model is useful for parallel implementation of
algorithms whose task can be divided into subtasks recursively.
|
|
A typical example is a parallelization of the Cantor-Zassenhaus
algorithm for polynomial factorization over finite fields,
which is a recursive algorithm; a rough sketch of how a server could
distribute its recursive calls to sub-servers is given below.
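The sketch below is only illustrative and is not part of the OpenXM
distribution: {\tt cz\_split()} and {\tt cz\_factor()} are hypothetical names
for a random splitting step and for a sequential equal-degree factorization
routine, and we assume that the file defining {\tt para\_fctr()} has been
loaded on every server.
\begin{verbatim}
/* F: a squarefree polynomial over GF(Mod) whose irreducible */
/* factors all have the same degree; Depth: recursion depth  */
def para_fctr(F,Mod,Depth)
{
  if ( Depth == 0 )
    return cz_factor(F,Mod);     /* hypothetical sequential routine */
  L = cz_split(F,Mod);           /* hypothetical splitting F = L[0]*L[1] */
  P = [ox_launch(),ox_launch()]; /* this server now acts as a client */
  ox_cmo_rpc(P[0],"para_fctr",L[0],Mod,Depth-1);
  ox_cmo_rpc(P[1],"para_fctr",L[1],Mod,Depth-1);
  R0 = ox_pop_cmo(P[0]);         /* gather the two factor lists */
  R1 = ox_pop_cmo(P[1]);
  map(ox_reset,P);
  return append(R0,R1);
}
\end{verbatim}
Each server invoked by {\tt ox\_launch()} executes the same code in turn,
so that the processes form a tree as in Figure \ref{tree}.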
|
|
|
|
|
\subsubsection{Product of univariate polynomials} |
|
|
|
Shoup \cite{Shoup} showed that the product of univariate polynomials
with large degrees and large coefficients can be computed efficiently
by FFT over small finite fields and the Chinese remainder theorem.
It can be easily parallelized:
|
|
|
\begin{tabbing}
Input :\= $f_1, f_2 \in {\bf Z}[x]$ such that $\deg(f_1), \deg(f_2) < 2^M$\\
Output : $f = f_1f_2$ \\
$P \leftarrow$ \= $\{m_1,\cdots,m_N\}$ where $m_i$ is an odd prime, \\
\> $2^{M+1}|m_i-1$ and $m=\prod m_i$ is sufficiently large. \\
Separate $P$ into disjoint subsets $P_1, \cdots, P_L$.\\
for \= $j=1$ to $L$ do $M_j \leftarrow \prod_{m_i\in P_j} m_i$\\
Compute $F_j$ such that $F_j \equiv f_1f_2 \bmod M_j$\\
\> and $F_j \equiv 0 \bmod m/M_j$ in parallel.\\
\> (The product is computed by FFT.)\\
return $\phi_m(\sum F_j)$\\
(For $a \in {\bf Z}$, $\phi_m(a) \in (-m/2,m/2)$ and $\phi_m(a)\equiv a \bmod m$)
\end{tabbing}
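As a toy illustration of the recombination step only (the size requirements
and the FFT condition $2^{M+1}|m_i-1$ are ignored here), take
$f_1 = x+1$, $f_2 = x+2$ and $P_1 = \{5\}$, $P_2 = \{7\}$, so that
$M_1 = 5$, $M_2 = 7$ and $m = 35$. Then
\begin{eqnarray*}
F_1 &=& 21x^2+28x+7 \quad (F_1 \equiv f_1f_2 \bmod 5,\ F_1 \equiv 0 \bmod 7),\\
F_2 &=& 15x^2+10x+30 \quad (F_2 \equiv f_1f_2 \bmod 7,\ F_2 \equiv 0 \bmod 5),
\end{eqnarray*}
and $\phi_{35}(F_1+F_2) = \phi_{35}(36x^2+38x+37) = x^2+3x+2 = f_1f_2$.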
|
|
|
Figure \ref{speedup} shows the speedup factor under the above distributed
computation on Risa/Asir. For each $n$, two polynomials of degree $n$
with 3000-bit coefficients are generated and the product is computed.
The machine is a FUJITSU AP3000, a cluster of Sun workstations connected
by a high-speed network, and MPI over the network is used to implement OpenXM.
|
\begin{figure}[htbp]
\epsfxsize=8.5cm
\epsffile{speedup.ps}
\caption{Speedup factor}
\label{speedup}
\end{figure}
|
|
|
If the number of servers is $L$ and the inputs are fixed, then the cost to
compute the $F_j$ in parallel is $O(1/L)$, whereas the cost
to send and receive polynomials is $O(L)$ if {\tt ox\_push\_cmo()} and
{\tt ox\_pop\_cmo()} are repeatedly applied on the client.
Therefore the speedup is limited, and the upper bound of
the speedup factor depends on the ratio of
the computational cost to the communication cost for each unit operation.
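As a rough model of this limitation, suppose that the whole modular
computation takes time $T$ on a single server and that transferring the
inputs and one $F_j$ for one server costs time $c$; then the elapsed time
behaves like
\[
\frac{T}{L} + cL,
\]
which is minimized around $L = \sqrt{T/c}$, so that adding servers beyond
this point makes the total time grow again.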
|
Figure \ref{speedup} shows that
the speedup is satisfactory if the degree is large and $L$
is not large, say, up to 10 in the above environment.
|
If OpenXM provides collective operations for broadcast and reduction
such as {\tt MPI\_Bcast} and {\tt MPI\_Reduce} respectively, the cost of
sending $f_1$, $f_2$ and gathering the $F_j$ may be reduced to $O(\log_2 L)$,
since a binary broadcast (or reduction) tree reaches all $L$ servers in
$\lceil \log_2 L \rceil$ communication steps, and we can expect better
results in such a case. In order to implement
such operations we need new specifications for inter-server communication
and session management, which will be proposed as OpenXM-RFC 102.
We note that preliminary experiments show that the collective operations
work well on OpenXM.
|
|
|
|