===================================================================
RCS file: /home/cvs/OpenXM/doc/ascm2001p/homogeneous-network.tex,v
retrieving revision 1.1
retrieving revision 1.8
diff -u -p -r1.1 -r1.8
--- OpenXM/doc/ascm2001p/homogeneous-network.tex	2001/06/19 07:32:58	1.1
+++ OpenXM/doc/ascm2001p/homogeneous-network.tex	2001/06/21 03:09:46	1.8
@@ -1,137 +1,85 @@
-% $OpenXM$
+% $OpenXM: OpenXM/doc/ascm2001p/homogeneous-network.tex,v 1.7 2001/06/20 05:42:47 takayama Exp $
 \subsection{Distributed computation with homogeneous servers}
 \label{section:homog}
 One of the aims of OpenXM is a parallel speedup by a distributed computation
-with homogeneous servers. As the current specification of OpenXM does
-not include communication between servers, one cannot expect
-the maximal parallel speedup. However it is possible to execute
-several types of distributed computation as follows.
+with homogeneous servers. Let us see some examples.
+%As the current specification of OpenXM does
+%not include communication between servers, one cannot expect
+%the maximal parallel speedup. However it is possible to execute
+%several types of distributed computation as follows.
 
-\subsubsection{Product of univariate polynomials}
+\subsubsection{Competitive distributed computation by various strategies}
 
-Shoup \cite{Shoup} showed that the product of univariate polynomials
-with large degrees and large coefficients can be computed efficiently
-by FFT over small finite fields and Chinese remainder theorem,
-which can be easily parallelized.
-%
-%\begin{tabbing}
-%Input :\= $f_1, f_2 \in {\bf Z}[x]$ such that $deg(f_1), deg(f_2) < 2^M$\\
-%Output : $f = f_1f_2$ \\
-%$P \leftarrow$ \= $\{m_1,\cdots,m_N\}$ where $m_i$ is an odd prime, \\
-%\> $2^{M+1}|m_i-1$ and $m=\prod m_i $ is sufficiently large. \\
-%Separate $P$ into disjoint subsets $P_1, \cdots, P_L$.\\
-%for \= $j=1$ to $L$ $M_j \leftarrow \prod_{m_i\in P_j} m_i$\\
-%Compute $F_j$ such that $F_j \equiv f_1f_2 \bmod M_j$\\
-%\> and $F_j \equiv 0 \bmod m/M_j$ in parallel.\\
-%\> (The product is computed by FFT.)\\
-%return $\phi_m(\sum F_j)$\\
-%(For $a \in {\bf Z}$, $\phi_m(a) \in (-m/2,m/2)$ and $\phi_m(a)\equiv a \bmod m$)
-%\end{tabbing}
-%
-Figure \ref{speedup}
-shows the speedup factor under the above distributed computation
-on Risa/Asir. For each $n$, two polynomials of degree $n$
-with 3000bit coefficients are generated and the product is computed.
-The machine is FUJITSU AP3000,
-a cluster of Sun workstations connected with a high speed network
-and MPI over the network is used to implement OpenXM.
-\begin{figure}[htbp]
-\epsfxsize=10cm
-\epsffile{speedup.ps}
-\caption{Speedup factor}
-\label{speedup}
-\end{figure}
-If the number of servers is $L$ and the inputs are fixed, then the cost to
-compute the products modulo some integers in parallel is $O(1/L)$,
-whereas the cost
-to send and receive polynomials is $O(L)$ if {\tt ox\_push\_cmo()} and
-{\tt ox\_pop\_cmo()} are repeatedly applied on the client.
-Therefore the speedup is limited and the upper bound of
-the speedup factor depends on the ratio of
-the computational cost and the communication cost for each unit operation.
-Figure \ref{speedup} shows that
-the speedup is satisfactory if the degree is large and $L$
-is not large, say, up to 10 under the above environment.
-If OpenXM provides collective operations for broadcast and reduction
-such as {\tt MPI\_Bcast} and {\tt MPI\_Reduce} respectively, the cost of
-broadcasting the inputs and gathering the results on the servers
-may be reduced to $O(\log_2L)$
-and we can expect better results in such a case. In order to implement
-such operations we need new specifications for inter-sever communication
-and the session management, which will be proposed as OpenXM-RFC 102.
-We note that preliminary experiments show the collective operations
-work well on OpenXM.
+SINGULAR \cite{Singular} implements the MP interface for distributed
+computation, and a competitive Gr\"obner basis computation is
+presented as an example of distributed computation via this interface.
+Such a distributed computation is also possible on OpenXM.
 
-%\subsubsection{Competitive distributed computation by various strategies}
-%
-%SINGULAR \cite{Singular} implements {\it MP} interface for distributed
-%computation and a competitive Gr\"obner basis computation is
-%illustrated as an example of distributed computation.
-%Such a distributed computation is also possible on OpenXM as follows:
-%
-%The client creates two servers and it requests
-%Gr\"obner basis comutations from the homogenized input and the input itself
-%to the servers.
-%The client watches the streams by {\tt ox\_select()}
-%and the result which is returned first is taken. Then the remaining
-%server is reset.
-%
-%\begin{verbatim}
-%/* G:set of polys; V:list of variables */
-%/* O:type of order; P0,P1: id's of servers */
-%def dgr(G,V,O,P0,P1)
-%{
-% P = [P0,P1]; /* server list */
-% map(ox_reset,P); /* reset servers */
-% /* P0 executes non-homogenized computation */
-% ox_cmo_rpc(P0,"dp_gr_main",G,V,0,1,O);
-% /* P1 executes homogenized computation */
-% ox_cmo_rpc(P1,"dp_gr_main",G,V,1,1,O);
-% map(ox_push_cmd,P,262); /* 262 = OX_popCMO */
-% F = ox_select(P); /* wait for data */
-% /* F[0] is a server's id which is ready */
-% R = ox_get(F[0]);
-% if ( F[0] == P0 ) {
-% Win = "nonhomo"; Lose = P1;
-% } else {
-% Win = "homo"; Lose = P0;
-% }
-% ox_reset(Lose); /* reset the loser */
-% return [Win,R];
-%}
-%\end{verbatim}
+\begin{verbatim}
+extern Proc1,Proc2$
+Proc1 = -1$ Proc2 = -1$
+/* G : set of polys; V : list of variables */
+/* Mod : characteristic of the ground field GF(Mod); O : type of order */
+def dgr(G,V,Mod,O)
+{
+  /* invoke the servers if necessary */
+  if ( Proc1 == -1 ) Proc1 = ox_launch();
+  if ( Proc2 == -1 ) Proc2 = ox_launch();
+  P = [Proc1,Proc2];
+  map(ox_reset,P); /* reset the servers */
+  /* P[0] executes the Buchberger algorithm over GF(Mod) */
+  ox_cmo_rpc(P[0],"dp_gr_mod_main",G,V,0,Mod,O);
+  /* P[1] executes the F4 algorithm over GF(Mod) */
+  ox_cmo_rpc(P[1],"dp_f4_mod_main",G,V,Mod,O);
+  map(ox_push_cmd,P,262); /* 262 = OX_popCMO */
+  F = ox_select(P); /* wait for data */
+  /* F[0] is the id of a server which is ready */
+  R = ox_get(F[0]);
+  if ( F[0] == P[0] ) { Win = "Buchberger"; Lose = P[1]; }
+  else { Win = "F4"; Lose = P[0]; }
+  ox_reset(Lose); /* reset the loser */
+  return [Win,R];
+}
+\end{verbatim}
+In the above Asir program, the client creates two servers and requests
+Gr\"obner basis computations of the same input from both of them,
+one by the Buchberger algorithm and the other by the $F_4$ algorithm.
+The client watches the streams with {\tt ox\_select()} and takes the
+result which is returned first. Then the remaining server is reset.
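+
+A hypothetical session using {\tt dgr()} might look as follows;
+the input system (here the Katsura-2 benchmark system) and the
+parameters are chosen only for illustration:
+\begin{verbatim}
+/* compete Buchberger vs. F4 over GF(31991) w.r.t. order type 0 */
+G = [u0+2*u1+2*u2-1, u0^2+2*u1^2+2*u2^2-u0, 2*u0*u1+2*u1*u2-u1];
+dgr(G,[u0,u1,u2],31991,0);
+\end{verbatim}
+The returned list contains the name of the winning strategy and
+the computed Gr\"obner basis.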
 
 \subsubsection{Nesting of client-server communication}
 
-Under OpenXM-RFC 100 an OpenXM server can be a client of other servers.
-Figure \ref{tree} illustrates a tree-like structure of an OpenXM
-client-server communication.
-
 \begin{figure}
 \label{tree}
 \begin{center}
-\begin{picture}(200,140)(0,0)
-\put(70,120){\framebox(40,15){client}}
-\put(20,60){\framebox(40,15){server}}
-\put(70,60){\framebox(40,15){server}}
-\put(120,60){\framebox(40,15){server}}
+\begin{picture}(200,70)(0,0)
+\put(70,70){\framebox(40,15){client}}
+\put(20,30){\framebox(40,15){server}}
+\put(70,30){\framebox(40,15){server}}
+\put(120,30){\framebox(40,15){server}}
 \put(0,0){\framebox(40,15){server}}
 \put(50,0){\framebox(40,15){server}}
-\put(135,0){\framebox(40,15){server}}
+\put(150,0){\framebox(40,15){server}}
 
-\put(90,120){\vector(-1,-1){43}}
-\put(90,120){\vector(0,-1){43}}
-\put(90,120){\vector(1,-1){43}}
-\put(40,60){\vector(-1,-2){22}}
-\put(40,60){\vector(1,-2){22}}
-\put(140,60){\vector(1,-3){14}}
+\put(90,70){\vector(-2,-1){43}}
+\put(90,70){\vector(0,-1){21}}
+\put(90,70){\vector(2,-1){43}}
+\put(40,30){\vector(-2,-1){22}}
+\put(40,30){\vector(2,-1){22}}
+\put(140,30){\vector(2,-1){22}}
 \end{picture}
 \caption{Tree-like structure of client-server communication}
 \end{center}
 \end{figure}
-
+%%Prog: load ("dfff"); df_demo(); enter 100.
+Under OpenXM-RFC 100 an OpenXM server can be a client of other servers.
+%Figure \ref{tree}
+Figure 2
+illustrates a tree-like structure of an OpenXM
+client-server communication.
 Such a computational model is useful for parallel implementation of
 algorithms whose task can be divided into subtasks recursively.
@@ -186,15 +134,15 @@ algorithms whose task can be divided into subtasks rec
 % }
 %}
 %\end{verbatim}
-
+%
 A typical example is a parallelization of the Cantor-Zassenhaus
-algorithm for polynomial factorization over finite fields.
+algorithm for polynomial factorization over finite fields, which is a recursive algorithm.
 At each level of the recursion, a given polynomial
 can be divided into two non-trivial factors
 with some probability by using a randomly generated polynomial
 as a {\it separator}.
 We can apply the following simple parallelization:
-When two non-trivial factors are generated on a server,
+when two non-trivial factors are generated on a server,
 one is sent to another server and the other factor is factorized on the server
 itself.
 %\begin{verbatim}
 % /* factorization of a square-free polynomial F over GF(M) */
 % def c_z(F,E,V)
 % {
 % V = [car(V)]; N = deg(F,V[0]);
@@ -206,7 +154,7 @@ itself.
 % if ( N == E ) return [F];
 % M = field_order_ff(); K = idiv(N,E); L = [F];
 % while ( 1 ) {
-% /* gererate a random polynomial */
+% /* generate a random polynomial */
 % W = monic_randpoly_ff(2*E,V);
 % /* compute a power of the random polynomial */
 % T = generic_pwrmod_ff(W,F,idiv(M^E-1,2));
@@ -242,3 +190,97 @@ itself.
 %
 %
 %
+
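+A minimal sketch of the dispatch step is given below. The management
+of idle servers is only indicated: {\tt FreeProc}, the id of an idle
+server on which {\tt c\_z()} is also defined, is a hypothetical
+bookkeeping detail of this sketch, not part of the actual implementation.
+\begin{verbatim}
+extern FreeProc$
+/* F1, F2 : two non-trivial factors found on this server */
+def c_z_dispatch(F1,F2,E,V)
+{
+  /* send one factor to an idle server ... */
+  ox_cmo_rpc(FreeProc,"c_z",F1,E,V);
+  ox_push_cmd(FreeProc,262); /* 262 = OX_popCMO */
+  /* ... and factorize the other factor locally */
+  L = c_z(F2,E,V);
+  ox_select([FreeProc]); /* wait for the remote result */
+  R = ox_get(FreeProc);  /* list of factors found remotely */
+  return append(R,L);
+}
+\end{verbatim}
+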
+\subsubsection{Product of univariate polynomials}
+
+Shoup \cite{Shoup} showed that the product of univariate polynomials
+with large degrees and large coefficients can be computed efficiently
+by FFT over small finite fields and the Chinese remainder theorem.
+It can be easily parallelized:
+
+\begin{tabbing}
+Input :\= $f_1, f_2 \in {\bf Z}[x]$ such that $\deg(f_1), \deg(f_2) < 2^M$\\
+Output : $f = f_1f_2$ \\
+$P \leftarrow$ \= $\{m_1,\cdots,m_N\}$ where $m_i$ is an odd prime, \\
+\> $2^{M+1} \mid m_i-1$ and $m=\prod m_i$ is sufficiently large. \\
+Separate $P$ into disjoint subsets $P_1, \cdots, P_L$.\\
+for \= $j=1$ to $L$, $M_j \leftarrow \prod_{m_i\in P_j} m_i$\\
+Compute $F_j$ such that $F_j \equiv f_1f_2 \bmod M_j$\\
+\> and $F_j \equiv 0 \bmod m/M_j$ in parallel.\\
+\> (The product is computed by FFT.)\\
+return $\phi_m(\sum F_j)$\\
+(For $a \in {\bf Z}$, $\phi_m(a) \in (-m/2,m/2)$ and $\phi_m(a)\equiv a \bmod m$)
+\end{tabbing}
+
+Figure \ref{speedup}
+shows the speedup factor under the above distributed computation
+on Risa/Asir. For each $n$, two polynomials of degree $n$
+with 3000-bit coefficients are generated and their product is computed.
+The machine is FUJITSU AP3000,
+a cluster of Sun workstations connected with a high-speed network,
+and MPI over the network is used to implement OpenXM.
+\begin{figure}[htbp]
+\epsfxsize=8.5cm
+\epsffile{speedup.ps}
+\caption{Speedup factor}
+\label{speedup}
+\end{figure}
+
+If the number of servers is $L$ and the inputs are fixed, then the cost to
+compute $F_j$ in parallel is $O(1/L)$, whereas the cost
+to send and receive polynomials is $O(L)$ if {\tt ox\_push\_cmo()} and
+{\tt ox\_pop\_cmo()} are repeatedly applied on the client.
+Therefore the speedup is limited, and the upper bound of
+the speedup factor depends on the ratio of
+the computational cost to the communication cost for each unit operation.
+Figure \ref{speedup} shows that
+the speedup is satisfactory if the degree is large and $L$
+is not large, say, up to 10 under the above environment.
+If OpenXM provides collective operations for broadcast and reduction,
+such as {\tt MPI\_Bcast} and {\tt MPI\_Reduce} respectively, the cost of
+sending $f_1$, $f_2$ and gathering the $F_j$ may be reduced to $O(\log_2 L)$,
+and we can expect better results in such a case. In order to implement
+such operations we need new specifications for inter-server communication
+and session management, which will be proposed as OpenXM-RFC 102.
+We note that preliminary experiments show that the collective operations
+work well on OpenXM.
+
+%\subsubsection{Competitive distributed computation by various strategies}
+%
+%SINGULAR \cite{Singular} implements {\it MP} interface for distributed
+%computation and a competitive Gr\"obner basis computation is
+%illustrated as an example of distributed computation.
+%Such a distributed computation is also possible on OpenXM as follows:
+%
+%The client creates two servers and it requests
+%Gr\"obner basis computations from the homogenized input and the input itself
+%to the servers.
+%The client watches the streams by {\tt ox\_select()}
+%and the result which is returned first is taken. Then the remaining
+%server is reset.
+%
+%\begin{verbatim}
+%/* G:set of polys; V:list of variables */
+%/* O:type of order; P0,P1: id's of servers */
+%def dgr(G,V,O,P0,P1)
+%{
+% P = [P0,P1]; /* server list */
+% map(ox_reset,P); /* reset servers */
+% /* P0 executes non-homogenized computation */
+% ox_cmo_rpc(P0,"dp_gr_main",G,V,0,1,O);
+% /* P1 executes homogenized computation */
+% ox_cmo_rpc(P1,"dp_gr_main",G,V,1,1,O);
+% map(ox_push_cmd,P,262); /* 262 = OX_popCMO */
+% F = ox_select(P); /* wait for data */
+% /* F[0] is a server's id which is ready */
+% R = ox_get(F[0]);
+% if ( F[0] == P0 ) {
+% Win = "nonhomo"; Lose = P1;
+% } else {
+% Win = "homo"; Lose = P0;
+% }
+% ox_reset(Lose); /* reset the loser */
+% return [Win,R];
+%}
+%\end{verbatim}
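+
+To make the client-side communication pattern in the above analysis
+concrete, the following is a minimal sketch of the $O(L)$ distribution
+loop. The server-side function {\tt fft\_mulmod()}, which is assumed
+to compute $f_1f_2 \bmod M_j$ by FFT, is hypothetical; only the
+communication pattern matters here.
+\begin{verbatim}
+/* F1,F2 : input polynomials */
+/* P : list of server id's; MD : list of the moduli M_j */
+def par_mul(F1,F2,P,MD)
+{
+  L = length(P);
+  /* O(L) sends : request one modular product per server */
+  for ( J = 0; J < L; J++ )
+    ox_cmo_rpc(P[J],"fft_mulmod",F1,F2,MD[J]);
+  map(ox_push_cmd,P,262); /* 262 = OX_popCMO */
+  /* O(L) receives : gather the F_j */
+  R = [];
+  for ( J = 0; J < L; J++ ) {
+    ox_select([P[J]]); /* wait until P[J] is ready */
+    R = cons(ox_get(P[J]),R);
+  }
+  /* the F_j are then combined by the Chinese remainder theorem */
+  return R;
+}
+\end{verbatim}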