% $OpenXM: OpenXM/doc/ascm2001p/homogeneous-network.tex,v 1.3 2001/06/20 02:39:25 noro Exp $

\subsection{Distributed computation with homogeneous servers}
\label{section:homog}

One of the aims of OpenXM is parallel speedup by distributed computation
with homogeneous servers. As the current specification of OpenXM does
not include communication between servers, one cannot expect
the maximal parallel speedup. However, it is possible to execute
several types of distributed computation as follows.

\subsubsection{Competitive distributed computation by various strategies}

SINGULAR \cite{Singular} implements the {\it MP} interface for distributed
computation, and a competitive Gr\"obner basis computation is
illustrated as an example of distributed computation.
Such a distributed computation is also possible on OpenXM as follows:

The client creates two servers and requests
Gr\"obner basis computations for the same input from the two servers,
one by the Buchberger algorithm and the other by the $F_4$ algorithm.
The client watches the streams by {\tt ox\_select()}
and takes the result which is returned first. Then the remaining
server is reset.

\begin{verbatim}
extern Proc1,Proc2$
Proc1 = -1$ Proc2 = -1$
/* G:set of polys; V:list of variables */
/* Mod: the ground field GF(Mod); O:type of order */
def dgr(G,V,Mod,O)
{
  /* invoke servers if necessary */
  if ( Proc1 == -1 ) Proc1 = ox_launch();
  if ( Proc2 == -1 ) Proc2 = ox_launch();
  P = [Proc1,Proc2];
  map(ox_reset,P); /* reset servers */
  /* P0 executes Buchberger algorithm over GF(Mod) */
  ox_cmo_rpc(P[0],"dp_gr_mod_main",G,V,0,Mod,O);
  /* P1 executes F4 algorithm over GF(Mod) */
  ox_cmo_rpc(P[1],"dp_f4_mod_main",G,V,Mod,O);
  map(ox_push_cmd,P,262); /* 262 = OX_popCMO */
  F = ox_select(P); /* wait for data */
  /* F[0] is a server's id which is ready */
  R = ox_get(F[0]);
  if ( F[0] == P[0] ) { Win = "Buchberger"; Lose = P[1]; }
  else { Win = "F4"; Lose = P[0]; }
  ox_reset(Lose); /* reset the loser */
  return [Win,R];
}
\end{verbatim}
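
For example, assuming that the two servers can be launched on the local
host and that the order type {\tt 0} denotes the graded reverse
lexicographic order, the function might be called as follows; the ideal
and the variable list are placeholders chosen only for illustration.
\begin{verbatim}
/* hypothetical input over GF(31991) */
G = [x^3-y^2, x*y-z, y*z-x^2]$
V = [x,y,z]$
dgr(G,V,31991,0);
\end{verbatim}
The returned list contains the name of the winning strategy and the
computed Gr\"obner basis.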

\subsubsection{Nesting of client-server communication}

Under OpenXM-RFC 100, an OpenXM server can also be a client of other servers.
Figure \ref{tree} illustrates a tree-like structure of OpenXM
client-server communication.
\begin{figure}
\begin{center}
\begin{picture}(200,70)(0,0)
\put(70,70){\framebox(40,15){client}}
\put(20,30){\framebox(40,15){server}}
\put(70,30){\framebox(40,15){server}}
\put(120,30){\framebox(40,15){server}}
\put(0,0){\framebox(40,15){server}}
\put(50,0){\framebox(40,15){server}}
\put(150,0){\framebox(40,15){server}}

\put(90,70){\vector(-2,-1){43}}
\put(90,70){\vector(0,-1){21}}
\put(90,70){\vector(2,-1){43}}
\put(40,30){\vector(-2,-1){22}}
\put(40,30){\vector(2,-1){22}}
\put(140,30){\vector(2,-1){22}}
\end{picture}
\caption{Tree-like structure of client-server communication}
\label{tree}
\end{center}
\end{figure}
Such a computational model is useful for parallel implementation of
algorithms whose tasks can be divided recursively into subtasks.

%A typical example is {\it quicksort}, where an array to be sorted is
%partitioned into two sub-arrays and the algorithm is applied to each
%sub-array. In each level of recursion, two subtasks are generated
%and one can ask other OpenXM servers to execute them.
%Though it makes little contribution to the efficiency in the case of
%quicksort, we present an Asir program of this distributed quicksort
%to demonstrate that OpenXM gives an easy way to test this algorithm.
%In the program, a predefined constant {\tt LevelMax} determines
%whether new servers are launched or whole subtasks are done on the server.
%
%\begin{verbatim}
%#define LevelMax 2
%extern Proc1, Proc2;
%Proc1 = -1$ Proc2 = -1$
%
%/* sort [A[P],...,A[Q]] by quicksort */
%def quickSort(A,P,Q,Level) {
%  if (Q-P < 1) return A;
%  Mp = idiv(P+Q,2); M = A[Mp]; B = P; E = Q;
%  while (1) {
%    while (A[B] < M) B++;
%    while (A[E] > M && B <= E) E--;
%    if (B >= E) break;
%    else { T = A[B]; A[B] = A[E]; A[E] = T; E--; }
%  }
%  if (E < P) E = P;
%  if (Level < LevelMax) {
%   /* launch new servers if necessary */
%   if (Proc1 == -1) Proc1 = ox_launch(0);
%   if (Proc2 == -1) Proc2 = ox_launch(0);
%   /* send the requests to the servers */
%   ox_rpc(Proc1,"quickSort",A,P,E,Level+1);
%   ox_rpc(Proc2,"quickSort",A,E+1,Q,Level+1);
%   if (E-P < Q-E) {
%     A1 = ox_pop_local(Proc1);
%     A2 = ox_pop_local(Proc2);
%   }else{
%     A2 = ox_pop_local(Proc2);
%     A1 = ox_pop_local(Proc1);
%   }
%   for (I=P; I<=E; I++) A[I] = A1[I];
%   for (I=E+1; I<=Q; I++) A[I] = A2[I];
%   return(A);
%  }else{
%   /* everything is done on this server */
%   quickSort(A,P,E,Level+1);
%   quickSort(A,E+1,Q,Level+1);
%   return(A);
%  }
%}
%\end{verbatim}
%
A typical example is a parallelization of the Cantor-Zassenhaus
algorithm for polynomial factorization over finite fields,
which is a recursive algorithm.
At each level of the recursion, a given polynomial can be
divided into two non-trivial factors with some probability by using
a randomly generated polynomial as a {\it separator}.
We can apply the following simple parallelization:
when two non-trivial factors are generated on a server,
one is sent to another server and the other is factorized on the
server itself.
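
A minimal sketch of the dispatching step, written in the Asir user
language, is shown below. It assumes a server-side wrapper {\tt ox\_c\_z}
of the factorizer {\tt c\_z} and that a non-trivial factor {\tt G} of
{\tt F} has already been found by the random splitting.
\begin{verbatim}
/* G: a non-trivial factor of F found by the random splitting */
if ( Proc1 < 0 ) Proc1 = ox_launch(); /* launch a server once */
/* factor G remotely with the same ground field and level */
ox_cmo_rpc(Proc1,"ox_c_z",lmptop(G),E,setmod_ff(),Level+1);
L2 = c_z(sdiv(F,G),E,Level+1);        /* factor F/G locally */
L1 = map(simp_ff,ox_pop_cmo(Proc1));  /* collect the remote result */
return append(L1,L2);
\end{verbatim}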
%\begin{verbatim}
%/* factorization of F */
%/* E = degree of irreducible factors in F */
%def c_z(F,E,Level)
%{
%  V = var(F); N = deg(F,V);
%  if ( N == E ) return [F];
%  M = field_order_ff(); K = idiv(N,E); L = [F];
%  while ( 1 ) {
%    /* generate a random polynomial */
%    W = monic_randpoly_ff(2*E,V);
%    /* compute a power of the random polynomial */
%    T = generic_pwrmod_ff(W,F,idiv(M^E-1,2));
%    if ( !(W = T-1) ) continue;
%    /* G = GCD(F, (W^((M^E-1)/2)-1) mod F) */
%    G = ugcd(F,W);
%    if ( deg(G,V) && deg(G,V) < N ) {
%      /* G is a non-trivial factor of F */
%      if ( Level >= LevelMax ) {
%        /* everything is done on this server */
%        L1 = c_z(G,E,Level+1);
%        L2 = c_z(sdiv(F,G),E,Level+1);
%      } else {
%        /* launch a server if necessary */
%        if ( Proc1 < 0 ) Proc1 = ox_launch();
%        /* send a request with Level = Level+1 */
%        /* ox_c_z is a wrapper of c_z on the server */
%        ox_cmo_rpc(Proc1,"ox_c_z",lmptop(G),E,
%            setmod_ff(),Level+1);
%        /* the rest is done on this server */
%        L2 = c_z(sdiv(F,G),E,Level+1);
%        L1 = map(simp_ff,ox_pop_cmo(Proc1));
%      }
%      return append(L1,L2);
%    }
%  }
%}
%\end{verbatim}
%
%
%
%
%
%
%

\subsubsection{Product of univariate polynomials}

Shoup \cite{Shoup} showed that the product of univariate polynomials
with large degrees and large coefficients can be computed efficiently
by FFTs over small finite fields and the Chinese remainder theorem.
This method can easily be parallelized:

\begin{tabbing}
Input :\= $f_1, f_2 \in {\bf Z}[x]$ such that $\deg(f_1), \deg(f_2) < 2^M$\\
Output : $f = f_1f_2$ \\
$P \leftarrow$ \= $\{m_1,\cdots,m_N\}$ where $m_i$ is an odd prime, \\
\> $2^{M+1}|m_i-1$ and $m=\prod m_i$ is sufficiently large. \\
Separate $P$ into disjoint subsets $P_1, \cdots, P_L$.\\
for \= $j=1$ to $L$ do $M_j \leftarrow \prod_{m_i\in P_j} m_i$\\
Compute $F_j$ such that $F_j \equiv f_1f_2 \bmod M_j$\\
\> and $F_j \equiv 0 \bmod m/M_j$ in parallel.\\
\> (The product is computed by FFT.)\\
return $\phi_m(\sum F_j)$\\
(For $a \in {\bf Z}$, $\phi_m(a) \in (-m/2,m/2)$ and $\phi_m(a)\equiv a \bmod m$.)
\end{tabbing}
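
The correctness of the reconstruction can be seen as follows: since
$M_j$ divides $m/M_k$ for $k \neq j$, we have
$\sum_k F_k \equiv F_j \equiv f_1f_2 \bmod M_j$ for every $j$, and hence
$\sum_k F_k \equiv f_1f_2 \bmod m$ by the Chinese remainder theorem.
If $m$ is chosen so large that every coefficient of $f_1f_2$ lies in
$(-m/2,m/2)$, then $\phi_m(\sum_k F_k)$ is exactly $f_1f_2$.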

Figure \ref{speedup}
shows the speedup factor obtained by the above distributed computation
on Risa/Asir. For each $n$, two polynomials of degree $n$
with 3000-bit coefficients are generated and their product is computed.
The machine is a FUJITSU AP3000,
a cluster of Sun workstations connected by a high-speed network,
and MPI over this network is used to implement OpenXM.
\begin{figure}[htbp]
\epsfxsize=8.5cm
\epsffile{speedup.ps}
\caption{Speedup factor}
\label{speedup}
\end{figure}

If the number of servers is $L$ and the inputs are fixed, the cost to
compute the $F_j$ in parallel is $O(1/L)$, whereas the cost
to send and receive polynomials is $O(L)$ if {\tt ox\_push\_cmo()} and
{\tt ox\_pop\_cmo()} are applied repeatedly on the client.
Therefore the speedup is limited, and the upper bound of
the speedup factor depends on the ratio of
the computational cost to the communication cost for each unit operation.
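For example, in a crude model where the total computational cost is $T$
and the communication cost per server is $c$, the elapsed time is roughly
$T/L + cL$, which is minimized at $L = \sqrt{T/c}$; adding servers beyond
this point slows the computation down.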
Figure \ref{speedup} shows that
the speedup is satisfactory if the degree is large and $L$
is not large, say, up to 10, in the above environment.
If OpenXM provides collective operations for broadcast and reduction
such as {\tt MPI\_Bcast} and {\tt MPI\_Reduce} respectively, the cost of
sending $f_1$, $f_2$ and gathering the $F_j$ may be reduced to $O(\log_2 L)$,
and we can expect better results in such a case. In order to implement
such operations we need new specifications for inter-server communication
and session management, which will be proposed as OpenXM-RFC 102.
We note that preliminary experiments show that the collective operations
work well on OpenXM.

%\subsubsection{Competitive distributed computation by various strategies}
%
%SINGULAR \cite{Singular} implements the {\it MP} interface for distributed
%computation and a competitive Gr\"obner basis computation is
%illustrated as an example of distributed computation.
%Such a distributed computation is also possible on OpenXM as follows:
%
%The client creates two servers and requests
%Gr\"obner basis computations for the homogenized input and the input
%itself from the servers.
%The client watches the streams by {\tt ox\_select()}
%and the result which is returned first is taken. Then the remaining
%server is reset.
%
%\begin{verbatim}
%/* G:set of polys; V:list of variables */
%/* O:type of order; P0,P1: id's of servers */
%def dgr(G,V,O,P0,P1)
%{
%  P = [P0,P1]; /* server list */
%  map(ox_reset,P); /* reset servers */
%  /* P0 executes non-homogenized computation */
%  ox_cmo_rpc(P0,"dp_gr_main",G,V,0,1,O);
%  /* P1 executes homogenized computation */
%  ox_cmo_rpc(P1,"dp_gr_main",G,V,1,1,O);
%  map(ox_push_cmd,P,262); /* 262 = OX_popCMO */
%  F = ox_select(P); /* wait for data */
%  /* F[0] is a server's id which is ready */
%  R = ox_get(F[0]);
%  if ( F[0] == P0 ) {
%    Win = "nonhomo"; Lose = P1;
%  } else {
%    Win = "homo"; Lose = P0;
%  }
%  ox_reset(Lose); /* reset the loser */
%  return [Win,R];
%}
%\end{verbatim}
