NEURON
multisplit.cpp
Go to the documentation of this file.
1 #include <../../nrnconf.h>
2 
3 #include <stdio.h>
4 #include <stdlib.h>
5 #include <errno.h>
6 #include <InterViews/resource.h>
7 #include <OS/list.h>
8 #include <nrnoc2iv.h>
9 #include <nrnmpi.h>
10 #include <multisplit.h>
11 #include <unordered_map>
12 #include <memory>
13 
14 void nrnmpi_multisplit(Section*, double x, int sid, int backbone_style);
16 
17 extern int tree_changed;
18 extern int diam_changed;
19 extern void setup_topology();
20 extern void nrn_cachevec(int);
21 extern void nrn_matrix_node_free();
22 extern void (*nrn_multisplit_setup_)();
23 extern void* nrn_multisplit_triang(NrnThread*);
25 extern void* nrn_multisplit_bksub(NrnThread*);
26 extern double t;
27 
35 extern void (*nrn_multisplit_solve_)();
36 static void multisplit_v_setup();
37 static void multisplit_solve();
38 
39 extern double nrnmpi_rtcomp_time_;
40 #if PARANEURON
41 extern double nrnmpi_splitcell_wait_;
42 #else
43 static double nrnmpi_splitcell_wait_;
44 #endif
45 
46 #if NRNMPI
47 #else
48 static int nrnmpi_use;
49 static void nrnmpi_int_allgather(int*, int*, int){}
50 static void nrnmpi_int_allgatherv(int*, int*, int*, int*){}
51 static void nrnmpi_postrecv_doubles(double*, int, int, int, void**){}
52 static void nrnmpi_send_doubles(double*, int, int, int){}
53 static void nrnmpi_wait(void**){}
54 static void nrnmpi_barrier(){}
55 static double nrnmpi_wtime() { return 0.0; }
56 #endif
57 
58 class MultiSplit;
59 class MultiSplitControl;
60 
61 #define A(i) VEC_A(i)
62 #define B(i) VEC_B(i)
63 #define D(i) VEC_D(i)
64 #define RHS(i) VEC_RHS(i)
65 #define S1A(i) sid1A[i]
66 #define S1B(i) sid1B[i]
67 
68 // any number of nodes can have the same sid (generally those nodes
69 // will be on different machines).
70 // A node cannot have more than one sid.
71 // A tree cannot have more than two nodes with an sid and those nodes
72 // must be distinct.
73 
74 // NOTE:
75 // For cache efficiency, it would be nice to keep individual cell nodes
76 // together (i.e v_node from 0 to rootnodecount-1 are NOT the root nodes.)
77 // This would allow keeping the backbones contiguous and would simplify the
78 // solution of the sid0,1 2x2 at the end of triangularization that forms
79 // an N shaped matrix. Unfortunately, in several places, e.g. matrix setup,
80 // the first rootnodecount v_node are assumed to be the individual cell roots
81 // and at this time it would be confusing to work on that tangential issue.
82 // However, it does mean that we need some special structure since
83 // sid0 ...backbone interior nodes... sid1 are not contiguous in the v_node
84 // for a single cell.
85 // Presently the node order is
86 // 1) all the roots of cells with 0 or 1 sid (no backbone involved)
87 // 2) all the sid0 when there are two sids in a cell. 1) + 2) = rootnodecount
88 // 3) the interior backbone nodes (does not include sid0 or sid1 nodes)
89 // 4) all the sid1 nodes
90 // 5) all remaining nodes
91 // backbone_begin is the index for 2).
92 // backbone_interior_begin is one more than the last index for 2)
93 // backbone_sid1_begin is the index for 4)
94 // backbone_end is the index for 5.
95 
96 // Modification to handle short backbones
97 // Numerical accuracy/stability suffers when backbones are electrically
98 // short. Unfortunately, it is impossible to divide many 3-d reconstructed
99 // cells with any kind of load balance without using them. Also,
100 // unfortunately, it is probably not possible to solve the problem
101 // in general. However we can extend, without introducing
102 // too much parallel inefficiency, the implementation to allow an
103 // arbitrarily short backbone treated as a single node, as long as it
104 // does not connect to a short backbone on another machine. e.g we can
105 // have alternating short and long sausages but not a sequence of
106 // contiguous short sausages. The idea is to exchange backbone info
107 // in two stages. First receive the info that short backbones need
108 // (by definition only from long backbones), compute the exact D,RHS
109 // backbone end points (only a 2x2 matrix solve), and then send the
110 // exact short backbone info. matrix_exchange carries out the
111 // augmented communication. But we need to distinguish short from
112 // long backbones since for short ones we need to factor out the
113 // first part of bksub_backbone and do it in the middle of
114 // matrix_exchange (as well as avoid doing that twice in bksub_backbone).
115 // We can accomplish that by introducing the index
116 // backbone_long_begin between the backbone_begin and backbone_interior_begin
117 // indices. Then in matrix exchange we do backbone_begin to
118 // backbone_long_begin - 1 , and in bksub_backbone, the first part is
119 // from backbone_long_begin to backbone_interior_begin - 1.
120 
121 // Modification to handle any number of backbones, short or long, exactly.
122 // In this case, it is not necessary to consider numerical stability/accuracy.
123 // However a tree matrix with rank equal to the number of distinct sid for
124 // an entire parallelized cell must be solved between the send and receive
125 // stages for that cell. The main issue to be resolved is: which machine receives
126 // the matrix information. ie. a single sid subtree sends d and rhs to the
127 // (rthost)machine that gaussian eliminates the reduced tree
128 // matrix and a two sid backbone subtree sends the two d, a, b, and two rhs
129 // to rthost. After rthost solves the matrix (triangualize and back substitute)
130 // the answer for each sid is sent back to the machines needing it.
131 // Clearly, some savings results if rthost happens to be one of the machines
132 // with a 2 sid backbone. Also, it is probably best if a machine serves
133 // as rthost to as few cells as possible.
134 
135 // Modification to handle multiple subtrees with same sid on same machine.
136 // This is very valuable for load balancing. It demands a backbone style
137 // of 2 (using reduced tree matrices) and we allow user specified host
138 // for the reduced tree. The exact solution using reduced trees has
139 // proved so much more useful than the long/short backbone methods
140 // that we should consider removing the code
141 // for those others and implement only the reduced tree method.
142 
143 // Area handling prior to ReducedTree extension was and mostly still is:
144 // 1) sending long to short, long, reduced on the send side multiplies
145 // the send buffer (D and RHS components) by area (if non-zero area node)
146 // from 0 to iarea_short_long using area_node_indices and buf_area_indices.
147 // 2) receive what is needed by short backbone (and reduced tree) and the
148 // receive buffer for short backbong divides by area from iarea_short_long
149 // to narea using aread_node_indices and buf_area_indices. (not needed by
150 // reduced tree but the local D and RHS will have to be multiplied by the
151 // node area when filling the ReducedTree. Also the sid1A and sid1B terms
152 // will have to be area multiplied both on the ReducedTree host
153 // and when backbones are sent to the ReducedTree host
154 // 3) The short backbone is solved consistent with its original node area
155 // (and the ReducedTree is solved with all its equation having been multiplied
156 // by area.
157 // 4) Note that solving the short and reduced tree gives the answer which
158 // does not have to be scaled but we multiply the D and RHS by 1e30 so
159 // those equation are v = RHS/D regardless of area or coupling. So no need
160 // to divide by area on the short or reduced tree machines and it would not
161 // matter if we did so or not.
162 // 5) all remaining receives take place and what is received gets divided
163 // by the node area (0 to iarea_short_long). As mentioned above, this does
164 // not affect the result that was sent from the short or reduced tree
165 // host (since D and RHS were scaled by 1e30).
166 // So... Anything sent to reduced trees from another machine needs to get its
167 // info multiplied by the node area. This is sid0 D and RHS and if there is
168 // a sid1 then that D and RHS as well as sid1A and sid1B.
169 // Also... Anything rmapped into a reduced tree from the reduced tree machine
170 // needs to get its rmap info multiplied by the node area ( in fact we can
171 // do that in place before calling fillrmap since those values get the result
172 // *1e30 returned into them).
173 // We do the old methods (backbone style 0 and 1) completely segregated from
174 // the new reducedtree method by introducing a vector parallel to
175 // nodeindex_buffer_ called nodeindex_rthost_ that contains the hostid for the
176 // reduced tree host associated with that sid. Then -1 means old style and
177 // >= 0 means reducedtree style.
178 
179 // the implementation so far has turned out to be too brittle with regard
180 // to unimportant topology changes in the sense of new point processes or
181 // anything that ends up calling multisplit_v_setup. The experienced
182 // cause of this is because MultiSplitTransferInfo holds pointers into
183 // the sid1A and sid1B off diag elements and those arrays are freed
184 // and reallocated when multisplit_v_setup is called. Although
185 // exchange_setup cries out for a simplified and
186 // more efficient implementation, for now, opportunistically reset the
187 // pointers in the MultiSplitTransferInfo at the end multisplit_v_setup
188 
189 
190 struct Area2Buf {
191  int inode;
192  int n; // using only for transfer to ReducedTree on another
193  int ibuf[3]; // machine so n is 2 or 3
194  double adjust_rhs_; // for cvode
195  MultiSplit* ms; // for recalculating ibuf pointers after v_setup
196 };
197 struct Area2RT {
198  int inode;
199  int n; // 2 or 3
200  double* pd[3];
201  double adjust_rhs_; // for cvode
202  MultiSplit* ms; // for recalculating pd pointers after v_setup
203 };
204 
205 using Int2IntTable = std::unordered_map< int, int>;
206 
207 // d and rhs keep getting reorderd according to thread cache efficiency
208 // so we need to retain some logical info in order to update the
209 // pointer lists in the Reduced tree. Prior to this comment, this was
210 // done only for sid1A and sid1B near the end of v_setup. That is ok
211 // since sid1A and sid1B are allocated by v_setup. But d and rhs are
212 // filled into the Node* after return from v_setup.
213 
214 class ReducedTree {
215 public:
216  ReducedTree(MultiSplitControl*, int rank, int mapsize);
217  virtual ~ReducedTree();
218  void solve();
219  void nocap();
220 //private:
221 public:
222  void gather();
223  void scatter();
224 
226  int n;
227  int* ip;
228  double* rhs;
229  double* d;
230  double* a;
231  double* b;
232  int n2, n4, nmap;
233 
234 public:
235  // next implementation, exploit the fact that send buffer can be
236  // smaller than the receive buffer since we only need to send back
237  // the answer in RHS. Now we are sending back RHS, D,
238  // and even the offdiags.
239  double **smap, **rmap;
240  int *ismap, *irmap;
241  int nsmap, irfill;
242 
243  int* rmap2smap_index; // needed for nocap when cvode used
244  int* nzindex; // needed for nocap when cvode used
245  double* v; // needed for nocap when cvode used
246 
247  void reorder(int j, int nt, int* mark, int* all_bb_relation, int* allsid);
248  std::unique_ptr<Int2IntTable> s2rt{new Int2IntTable()}; // sid2rank table
249  void fillrmap(int sid1, int sid2, double* pd);
250  void fillsmap(int sid, double* prhs, double* pdiag);
251  void pr_map(int, double*);
252 };
253 
254 class MultiSplit {
255 public:
256  Node* nd[2];
257  int sid[2];
258  int backbone_style; // 0 = no, 1 = yes, 2 = reducedtree (exact)
259  int rthost; // nrnmpi_myid where reducedtree will be solved
260  // the problem with using backsid_ to find the index now is that
261  // the value may not be unique since several backbones can be
262  // connected at the same sid on this machine. So store the index into
263  // the backsid) array.
265  // and need the thread id as well
266  int ithread;
267  // reduced tree pointers to d and rhs can become invalid and
268  // need to be restored. According to the exchange_setup that calls
269  // ReducedTree::fillrmap, the order is nd[0]->rhs, nd[0]->d, nd[1]->rhs
270  // nd[1]->d.
274 };
275 
276 // note: the tag_ below was added in case this machine interacts with
277 // another via more than one message type. i.e combinations of short<->long,
278 // reduced<->long, and long<->long backbone interactions.
279 // The user should avoid this
280 // eventuality and in fact we might not allow it.
281 
282 // mpi transfer information
284  int host_; // host id for send-receive
285  int nnode_; // number of nodes involved
286  int* nodeindex_; // v_node indices of those nodes. Pointer into nodeindex_buffer_
287  int* nodeindex_th_; // thread for above
288  int nnode_rt_; // number of off diag elements involved
289  int* nd_rt_index_; // v_node indices of offdiag. Not pointer into nodeindex_buffer. Needed for area2buf
290  int* nd_rt_index_th_; // thread for above
291  double** offdiag_; // pointers to sid1A or sid1B off diag elements
292  int* ioffdiag_; // indices of above, to recalculate offdiag when freed
293  int size_; // 2*nnode_ + nnode_rt_ doubles needed in buffer
294  int displ_; // displacement into trecvbuf_
295  void* request_; // MPI_Request
296  int tag_; // short<->long, long<->long, subtree->rthost, rthost->subtree
297  int rthost_; // host id where the reduced tree is located (normally -1)
298 };
299 
300 using MultiSplitTable = std::unordered_map< Node*, MultiSplit*>;
301 declarePtrList(MultiSplitList, MultiSplit)
302 implementPtrList(MultiSplitList, MultiSplit)
303 
304 #include <multisplitcontrol.h>
306 
307 void nrnmpi_multisplit(Section* sec, double x, int sid, int backbone_style) {
308  if (!msc_) {
309  msc_ = new MultiSplitControl();
310  }
311  msc_->multisplit(sec, x, sid, backbone_style);
312 }
313 
315  narea2buf_ = narea2rt_ = 0;
316  area2buf_ = 0;
317  area2rt_ = 0;
318  nthost_ = 0;
319  ihost_reduced_long_ = ihost_short_long_ = 0;
320  msti_ = 0;
321  tbsize = 0;
322  ndbsize = 0;
323  trecvbuf_ = 0;
324  tsendbuf_ = 0;
325  nodeindex_buffer_ = 0;
326  nodeindex_buffer_th_ = 0;
327  nodeindex_rthost_ = 0;
328  narea_ = 0;
329  iarea_short_long_ = 0;
330  buf_area_indices_ = 0;
331  area_node_indices_ = 0;
332 
333  nrtree_ = 0;
334  rtree_ = 0;
335 
336 
337  multisplit_list_ = 0;
338  nth_ = 0;
339  mth_ = 0;
340 }
341 
343 }
344 
346  backbone_begin = backbone_long_begin = backbone_interior_begin = 0;
347  backbone_sid1_begin = backbone_long_sid1_begin = backbone_end = 0;
348  sid1A = sid1B = 0;
349  sid0i = 0;
350 
351  nbackrt_ = 0;
352  backsid_ = 0;
353  backAindex_ = 0;
354  backBindex_ = 0;
355  i1 = i2 = i3 = 0;
356 }
357 
359  del_sidA();
360 }
361 
362 void MultiSplitControl::multisplit(Section* sec, double x, int sid, int backbone_style) {
363 #if 0
364  if (sid > 1000) { pexch(); return; }
365  if (sid >= 1000) { pmat(sid>1000); return; }
366 #endif
367  if (sid < 0) {
368  nrn_cachevec(1);
369  if (classical_root_to_multisplit_) {
373  }
374  exchange_setup(); return;
375  }
377  if (backbone_style != 2) {
378  hoc_execerror("only backbone_style 2 is now supported", 0);
379  }
380  if (!classical_root_to_multisplit_) {
381  classical_root_to_multisplit_.reset(new MultiSplitTable());
382  classical_root_to_multisplit_->reserve(97);
383  multisplit_list_ = new MultiSplitList();
384  }
385  Node* nd = node_exact(sec, x);
386  if (tree_changed) { setup_topology(); }
387 // printf("root of %s(%g) ", secname(sec), x);
388  Node* root;
389  for (sec = nd->sec; sec; sec = sec->parentsec) {
390  root = sec->parentnode;
391  }
392  assert(root);
393 // printf("is %s\n", secname(root->sec));
394  MultiSplit* ms;
395  const auto& msiter = classical_root_to_multisplit_->find(root);
396  if (msiter != classical_root_to_multisplit_->end()) {
397  ms = msiter->second;
398  if (backbone_style == 2) {
399  if(ms->backbone_style != 2) {
400 hoc_execerror("earlier call for this cell did not have a backbone style = 2", 0);
401  }
402  }else if (backbone_style == 1) {
403  ms->backbone_style = 1;
404  }
405  ms->nd[1] = nd;
406  ms->sid[1] = sid;
407  if (ms->sid[1] == ms->sid[0]) {
408  char s[100];
409  sprintf(s, "two sid = %d at same point on tree rooted at", sid);
410  hoc_execerror(s, secname(root->sec));
411  }
412  }else{
413  ms = new MultiSplit();
414  ms->backbone_style = backbone_style;
415  ms->rthost = -1;
416  ms->nd[0] = nd;
417  ms->nd[1] = 0;
418  ms->sid[0] = sid;
419  ms->sid[1] = -1;
420  ms->back_index = -1;
421  ms->ithread = -1;
422  ms->rt_ = 0;
423  ms->rmap_index_ = -1;
424  ms->smap_index_ = -1;
425  (*classical_root_to_multisplit_)[root] = ms;
426  multisplit_list_->append(ms);
427  }
428 }
429 
431  if (sid1A) {
432  delete [] sid1A;
433  delete [] sid1B;
434  delete [] sid0i;
435  sid1A = 0; sid1B = 0; sid0i = 0;
436  }
437  if (nbackrt_) {
438  delete [] backsid_;
439  delete [] backAindex_;
440  delete [] backBindex_;
441  nbackrt_ = 0;
442  }
443 }
444 
446  int i;
447  if (nrtree_) {
448  for (i=0; i < nrtree_; ++i) {
449  delete rtree_[i];
450  }
451  delete [] rtree_;
452  nrtree_ = 0;
453  }
454  if (msti_) {
455  for (i=0; i < nthost_; ++i) {
456  if (msti_[i].nnode_rt_) {
457  delete [] msti_[i].nd_rt_index_;
458  delete [] msti_[i].nd_rt_index_th_;
459  delete [] msti_[i].offdiag_;
460  delete [] msti_[i].ioffdiag_;
461  }
462  }
463  delete [] msti_;
464  msti_ = 0;
465  if (nodeindex_buffer_) {
466  delete [] nodeindex_buffer_;
467  delete [] nodeindex_buffer_th_;
468  delete [] nodeindex_rthost_;
469  }
470  nodeindex_buffer_ = 0;
471  nodeindex_buffer_th_ = 0;
472  nodeindex_rthost_ = 0;
473  if (trecvbuf_) {
474  delete [] trecvbuf_;
475  delete [] tsendbuf_;
476  }
477  trecvbuf_ = 0; tsendbuf_ = 0;
478  if (narea_) {
479  delete [] buf_area_indices_;
480  delete [] area_node_indices_;
481  buf_area_indices_ = 0;
482  area_node_indices_ = 0;
483  narea_ = 0;
484  }
485  if(narea2buf_) {
486  // at present, statically 3 since only used for ReducedTree
487 // for (i=0; i < narea2buf_; ++i) {
488 // delete [] area2buf_->ibuf;
489 // }
490  delete [] area2buf_;
491  area2buf_ = 0; narea2buf_ = 0;
492  }
493  if(narea2rt_) {
494  delete [] area2rt_;
495  area2rt_ = 0; narea2rt_ = 0;
496  }
497  }
498 }
499 
501  if (msc_) {
502  msc_->multisplit_clear();
504  }
505 }
506 
508 //printf("nrnmpi_multisplit_clear()\n");
509  int i;
512  for (i=0; i < nth_; ++i) {
513  mth_[i].del_sidA();
514  }
515  if (mth_) {
516  delete [] mth_;
517  mth_ = 0;
518  }
519  nth_ = 0;
520  del_msti();
521  if (classical_root_to_multisplit_) {
522  MultiSplit* ms;
523  for(const auto& mspair : *classical_root_to_multisplit_) {
524  delete mspair.second;
525  }
526  classical_root_to_multisplit_.reset();
527  delete multisplit_list_;
528  multisplit_list_ = 0;
529  }
530 }
531 
532 // in the incremental development of exchange_setup from only long to short
533 // to reduced tree we end up with many iterations over all pc.multisplit
534 // calls. ie. every machines sid info. If the performance becomes
535 // visible relative to the total setup time then one can replace all those
536 // iterations by preserving only the sid graph info that is needed by each
537 // machine. A sid graph, tree, would contain a list of sid's and a list of
538 // machines. Then there would be a map of sid2graph. This list and map
539 // could be constructed with a single iteration. ie. when an edge is seen
540 // the graphs of the two sids would merge. Then one could throw away all
541 // graphs that have no sid on this machine. I would expect an order of
542 // magnitude performance improvement in exchange_setup.
543 
545  int i, j, k;
546  del_msti(); //above recalc_diam so multisplit_v_setup does not
547  // attempt to fill offdiag_
548  if (diam_changed) {
549  recalc_diam();
550  }
551 
552  // how many nodes are involved on this machine
553  // all nodes are distinct. For now we independently assert that
554  // all the sid on this machine are distinct. So when we get an
555  // sid from another machine it does not have to be directed to
556  // more than one place. So, just like splitcell, we cannot test
557  // a multisplit on a single machine.
558  // When sids are not distinct, in the context of reduced trees,
559  // there is no problem with respect to sending sid info to
560  // multiple places since each sid info gets added to the reduced
561  // tree and the reduced tree answer gets sent back to each sid.
562  // So there is not a lot of special code involved with multiple
563  // pieces on the same host, since each piece <-> reduced tree
564  // independently of where the piece and reduced tree are located.
565  int n = 0;
566  int nsbb = 0; // number of short backbone sid's
567  int nwc = 0; // number of backbonestyle=2 nodes.
568  if (classical_root_to_multisplit_) {
569  MultiSplit* ms;
570  for (i=0; i < multisplit_list_->count(); ++i) {
571  ms = multisplit_list_->item(i);
572  ++n;
573  if (ms->nd[1]) { ++n; }
574  if (ms->backbone_style == 2) {
575  ++nwc;
576  if (ms->nd[1]) { ++nwc; }
577  }else if (ms->backbone_style == 1) {
578  if (ms->nd[1]) {
579  nsbb += 2;
580  }else{ // shouldn't be set anyway
581  ms->backbone_style = 0;
582  }
583  }
584  }
585  }
586 //printf("%d n=%d nsbb=%d nwc=%d\n", nrnmpi_myid, n, nsbb, nwc);
587  if (nrnmpi_numprocs == 1 && n == 0) { return; }
588  // how many nodes are involved on each of the other machines
589  int* nn = new int[nrnmpi_numprocs];
590  if (nrnmpi_use) {
591  nrnmpi_int_allgather(&n, nn, 1);
592  }else{
593  nn[0] = n;
594  }
595 //if (nrnmpi_myid==0) for (i=0; i < nrnmpi_numprocs;++i) printf("%d %d nn=%d\n", nrnmpi_myid, i, nn[i]);
596  // what are the sid's on this machine
597  int* sid = 0;
598  int* inode = 0;
599  int* threadid = 0;
600  // an parallel array back to MultiSplit
601  MultiSplit** vec2ms = new MultiSplit*[n];
602  // following used to be is_ssb (is sid for short backbone)
603  // now we code that as well as enough information for the
604  // backbone_style==2 case to figure out the reduced tree matrix
605  // so 0 means backbone_style == 0, 1 means backbone_style == 1
606  // and > 1 means backbone_style == 2 and the value is
607  // other backbone sid+3 (so a value of 2 means that
608  // it is a single sid subtree and a value of 3 means that the
609  // other backbone sid is 0.
610  int* bb_relation = 0;
611  if (n > 0) {
612  sid = new int[n];
613  inode = new int[n];
614  bb_relation = new int[n];
615  threadid = new int[n];
616  }
617  if (classical_root_to_multisplit_) {
618  i = 0;
619  MultiSplit* ms;
620  for (int ii=0; ii < multisplit_list_->count(); ++ii) {
621  ms = multisplit_list_->item(ii);
622  sid[i] = ms->sid[0];
623  inode[i] = ms->nd[0]->v_node_index;
624  threadid[i] = ms->ithread;
625  bb_relation[i] = ms->backbone_style;
626  vec2ms[i] = ms;
627  ++i;
628  if (ms->nd[1]) {
629  sid[i] = ms->sid[1];
630  inode[i] = ms->nd[1]->v_node_index;
631  threadid[i] = ms->ithread;
632  bb_relation[i] = ms->backbone_style;
633  if (ms->backbone_style == 2) {
634  bb_relation[i-1] += 1 + sid[i];
635  bb_relation[i] += 1 + sid[i-1];
636  }
637  vec2ms[i] = ms;
638  ++i;
639  }
640  }
641  }
642 //for (i=0; i < n; ++i) { printf("%d %d sid=%d inode=%d %s %d\n", nrnmpi_myid, i, sid[i], inode[i], secname(v_node[inode[i]]->sec), v_node[inode[i]]->sec_node_index_);}
643  // what are the sid's and backbone status on each of the other machines
644  // first need a buffer for them and their offsets
645  int* displ = new int[nrnmpi_numprocs+1];
646  displ[0] = 0;
647  for (i = 0; i < nrnmpi_numprocs; ++i) {
648  displ[i+1] = displ[i] + nn[i];
649  }
650  int nt = displ[nrnmpi_numprocs]; // total number of (distinct) nodes
651  int* allsid = new int[nt];
652  int* all_bb_relation = new int[nt];
653  if (nrnmpi_use) {
654  nrnmpi_int_allgatherv(sid, allsid, nn, displ);
655  nrnmpi_int_allgatherv(bb_relation, all_bb_relation, nn, displ);
656  }else{
657  for (i=0; i < n; ++i) {
658  allsid[i] = sid[i];
659  all_bb_relation[i] = bb_relation[i];
660  }
661  }
662  if (!n) {
663  delete [] allsid;
664  delete [] all_bb_relation;
665  delete [] displ;
666  delete [] nn;
667  delete [] vec2ms;
668  errno = 0;
669  return;
670  }
671 
672  // now we need to analyze
673  // how many machines do we need to communicate with
674 
675  // note that when long sid is connected to 1 or more long sid
676  // and also a short backbone sid that there is no need to send
677  // or receive info from the long sid because the short sid sends
678  // back the answer.
679 
680  // A similar communication pattern holds for the backbone_style==2
681  // case. i.e. a star instead of all2all. The only problem is
682  // which host is the center of the star. The following
683  // code for backbone_style < 2 is almost unchanged but
684  // backbone_style >=2 issue handling is inserted as needed.
685 
686  // there should not be that many sid on one machine. so just use
687  // linear search
688 
689  // for an index into the all... vectors, mark[index] is the index
690  // into this hosts vectors of the corresponding sid. It gets slowly
691  // transformed into a communication pattern in the sense that
692  // mark[index] = -1 means no communication will take place between
693  // this host and the host associated with the index.
694  // For reduced tree purposes, we overload the mark semantics so
695  // that mark[index] points to one (of the possibly two) sid on the
696  // same cell. ie. all the sid on other cpus that are part of the
697  // whole cell point to that sid.
698 
699  // For backbone style 2,
700  // when there are several pieces on the same host with the same
701  // sid, then the mark points to the principle pieces sid. i.e all
702  // the sid on this cpu that are part of the whole cell also point
703  // to that sid.
704 
705  int* mark = new int[nt];
706  int* connects2short = new int[n];
707  for (i = 0; i < n; ++i) {// so we know if we will be sending to
708  connects2short[i] = 0; // a short backbone.
709  }
710  for (i=0; i < nt; ++i) {
711  mark[i] = -1;
712  for (j=0; j < n; ++j) {
713  if (allsid[i] == sid[j]) {
714  // here we can errcheck the case of 2 on one
715  // machine and a different style for the same
716  // sid on another machine. If this passes on
717  // all machines then a single cell is necessarily
718  // consistent with regard to all sids for it
719  // being backbone_style == 2
720  if ((bb_relation[j] >= 2) != (all_bb_relation[i] >= 2)) {
721 hoc_execerror("backbone_style==2 inconsistent between hosts for same sid", 0);
722  }
723 
724  if (all_bb_relation[i] < 2) {
725  mark[i] = j;
726  if (all_bb_relation[i] == 1) {
727  connects2short[j] = 1;
728  }
729  }
730  }
731  }
732  }
733 
734  // but we do not want to mark allsid on this machine
735  for (i=displ[nrnmpi_myid]; i < displ[nrnmpi_myid]+n; ++i) {
736  mark[i] = -1;
737  }
738  // but note that later when we mark the reduced tree items we
739  // do leave the marks for allsid on this machine (but the marks
740  // have a slightly different meaning
741 
742  // undo all the marks for long to long if there is a long to short
743  // but be careful not to undo the short to long
744  for (i=0; i < nt; ++i) {
745  if (mark[i] >= 0 && connects2short[mark[i]]
746  && all_bb_relation[i] == 0 && bb_relation[mark[i]] == 0) {
747  mark[i] = -1;
748  }
749  }
750  // make sure no short to short connections
751  for (i = 0; i < nt; ++i) {
752  if (mark[i] >= 0) {
753  if (bb_relation[mark[i]] == 1 && all_bb_relation[i] == 1) {
754 hoc_execerror("a short to short backbone interprocessor connection exists", 0);
755  }
756  }
757  }
758 
759  // right now, the communication pattern represented by mark is
760  // bogus for backbone_style == 2 and they are all -1.
761  // We want to adjust mark so all the sid on
762  // a single cell point to one of the sid on this cell
763  // The following uses an excessive number of iterations
764  // over the all... vectors (proportional to number of distinct sids
765  // on a cell and should be made more efficient
766  for (j = 0; j < n; ++j) {
767  if (bb_relation[j] >= 2) {
768  reduced_mark(j, sid[j], nt, mark, allsid, all_bb_relation);
769  // note the above will also mark both this host sids
770  // so when the other one comes around here it will
771  // mark nothing.
772  }
773  }
774 
775 #if 0
776 for (i=0; i < nt; ++i) { printf("%d %d allsid=%d all_bb_relation=%d mark=%d\n",
777 nrnmpi_myid, i, allsid[i], all_bb_relation[i], mark[i]);}
778 #endif
779 
780  // it is an error if there are two pieces of the same cell on the
781  // same host. All we need to do is make sure the mark for this
782  // cpu are unique. i.e belong to different cells
783  // This is no longer an error. For backbonestyle 2, allowing
784  // multiple pieces of same cell on same cpu.
785  if (0) {
786  int* mcnt = new int[n];
787  for (j=0; j < n; ++j) { mcnt[j] = 0; }
788  for (i = displ[nrnmpi_myid]; i < displ[nrnmpi_myid]+n; ++i) {
789  if (all_bb_relation[i] >= 2 && mark[i] >= 0) {
790  mcnt[mark[i]] += 1;
791  if (all_bb_relation[i] > 2) { ++i; }
792  }
793  }
794  for (j=0; j < n; ++j) {
795  assert(mcnt[j] < 2);
796  }
797  delete [] mcnt;
798  }
799 
800  // for reduced trees, we now need to settle on which host will solve
801  // a reduced tree. If rthost is this one then this receives from
802  // and sends to every host that has a sid on the cell. If the rthost
803  // is another, then this sends and receives to rthost.
804 
805  // For now let it be the first host with two sid, and if there is
806  // no such host (then why are we using style 2?) then the first
807  // host with 1.
808  nrtree_ = 0;
809  int* rthost = new int[n];
810  ReducedTree** rt = new ReducedTree*[n];
811  for (j=0; j < n; ++j) { rthost[j] = -1; rt[j] = 0; }
812  if (nwc) {
813  MultiSplit* ms;
814  for (int ii=0; ii < multisplit_list_->count(); ++ii) {
815  ms = multisplit_list_->item(ii);
816  if (ms->backbone_style == 2) {
817  for (j = 0; j < n; ++j) {//find the mark value
818  if (sid[j] == ms->sid[0]) {
819  break;
820  }
821  }
822  ms->rthost = -1;
823  k = -1;
824  for (int ih=0; ih < nrnmpi_numprocs; ++ih) {
825  for (i=displ[ih]; i < displ[ih+1]; ++i) {
826  if (mark[i] == j) {
827  if (all_bb_relation[i] > 2) {
828  ms->rthost = ih;
829  break;
830  }
831  if (all_bb_relation[i] == 2) {
832  if (k == -1) {
833  k = ih;
834  }
835  }
836  }
837  }
838  if (ms->rthost != -1) { break; }
839  }
840  if (ms->rthost == -1) { ms->rthost = k; }
841  // only want rthost[j] for sid[0], not sid[1]
842  rthost[j] = ms->rthost;
843 //printf("%d sid0=%d sid1=%d rthost=%d\n", nrnmpi_myid, ms->sid[0], ms->sid[1], ms->rthost);
844  }
845  }
846  // it is important that only the principal sid has rthost != 0
847  // since we count them below, nrtree_, to allocate rtree_.
848  }
849 #if 0
850 for (j=0; j < n; ++j) {
851 printf("%d j=%d sid=%d bb_relation=%d rthost=%d\n", nrnmpi_myid,j, sid[j],
852 bb_relation[j], rthost[j]);
853 }
854 #endif
855  // there can be a problem when one host sends parts of two cells
856  // to another host due to cells not being in the same order with
857  // respect to NrnHashIterate. So demand that they are in the
858  // same order. Note this can also be used to assert that there
859  // are not two pieces on a host from the same cell.
860  // But we no longer use NrnHashIterate and the order is consistent
861  // because ordering is always with respect to the allgathered
862  // arrays of size nt.
863  if (0) {
864  for (i=0; i < nrnmpi_numprocs; ++i) {
865  int ix = -1;
866  int nj = displ[i+1];
867  for (j = displ[i]; j < nj; ++j) {
868  if (all_bb_relation[j] >= 2 && mark[j] >= 0 && rthost[mark[j]] == nrnmpi_myid) {
869  assert(mark[j] > ix);
870  ix = mark[j];
871  if (all_bb_relation[j] > 2) { ++j; }
872  }
873  }
874  }
875  }
876 
877  // allocate rtree_
878  for (j=0; j < n; ++j) {
879  if (rthost[j] == nrnmpi_myid) {
880  ++nrtree_;
881  }
882  }
883  if (nrtree_) {
884  i = 0;
885  rtree_ = new ReducedTree*[nrtree_];
886  for (j=0; j < n; ++j) {
887  if (rthost[j] == nrnmpi_myid) {
888  // sid to rank table
889  std::unique_ptr<Int2IntTable> s2rt{new Int2IntTable()};
890  s2rt->reserve(20);
891  int rank = 0; int mapsize = 0;
892  for (k=0; k < nt; ++k) {
893  if (mark[k] == j && all_bb_relation[k] >= 2) {
894  const auto& result = s2rt->insert({allsid[k], rank});
895  if (result.second) {
896  rank++;
897  }
898  if (all_bb_relation[k] == 2) {
899  mapsize += 2;
900  }else{
901  mapsize += 3;
902  }
903  }
904  }
905  rtree_[i] = new ReducedTree(this, rank, mapsize);
906  rt[j] = rtree_[i]; // needed for third pass below
907 //printf("%d new ReducedTree(%d, %d)\n", nrnmpi_myid, rank, mapsize);
908  // at this point the ReducedTree.s2rt is not in tree order
909  // (in fact we do not even know if it is a tree)
910  // so reorder. For a tree, we know there must be ReducedTree.n - 1
911  // edges.
912  rtree_[i]->s2rt = std::move(s2rt);
913  rtree_[i]->reorder(j, nt, mark, all_bb_relation, allsid);
914  ++i;
915  }
916  }
917  }
918 
919  // fill in the rest of the rthost[], mt->rthost, and rt
920  for (int ii=0, j=0; ii < multisplit_list_->count(); ++ii, ++j) {
921  MultiSplit* ms = multisplit_list_->item(ii);
922  if (ms->backbone_style == 2) {
923  int jj = mark[displ[nrnmpi_myid] + j];
924  ms->rthost = rthost[jj];
925  rthost[j] = ms->rthost;
926  rt[j] = rt[jj];
927  if (ms->nd[1]) { ++j; }
928  }
929  }
930 #if 0
931  for (int ii=0; ii < multisplit_list_->count(); ++ii) {
932  MultiSplit* ms = multisplit_list_->item(ii);
933  if (ms->backbone_style == 2) {
934 printf("%d sid0=%d sid1=%d rthost=%d\n", nrnmpi_myid, ms->sid[0], ms->sid[1], ms->rthost);
935  }
936  }
937 for (j=0; j < n; ++j) {
938 printf("%d j=%d sid=%d bb_relation=%d rthost=%d\n", nrnmpi_myid,j, sid[j],
939 bb_relation[j], rthost[j]);
940 }
941 #endif
942 
943  // count the reduced tree related nodes that have non-zero area
944  for (i=0; i < n; ++i) {
945  // remember rthost >= 0 only for sid[0]
946  if (rthost[i] >= 0) for (j=0; j < 2; ++j) {
947  NrnThread* _nt = nrn_threads + threadid[i];
948  Node* nd = _nt->_v_node[inode[i+j]];
949  if (nd->_classical_parent
950  && nd->sec_node_index_ < nd->sec->nnode-1) {
951  if (rthost[i] == nrnmpi_myid) {
952  ++narea2rt_;
953  }else{
954  ++narea2buf_;
955  }
956  }
957  if (bb_relation[i] == 2) { // sid[0] only
958  break;
959  }
960  if (j == 1) { ++i; }
961  }
962  }
963  if (narea2rt_) { area2rt_ = new Area2RT[narea2rt_]; }
964  if (narea2buf_) { area2buf_ = new Area2Buf[narea2buf_]; }
965  // can fill in all of the info for area2rt_ but only some for area2buf_
966  // actually, area2rt is the one used in practice since only the soma ever
967  // has a sid at a non-zero area node and the reduced tree will probably
968  // be on the machine with the soma.
969  narea2rt_ = narea2buf_ = 0;
970  for (i=0; i < n; ++i) {
971  // remember rthost >= 0 only for sid[0]
972  if (rthost[i] >= 0) for (j=0; j < 2; ++j) {
973  NrnThread* _nt = nrn_threads + threadid[i];
974  Node* nd = _nt->_v_node[inode[i+j]];
975  if (nd->_classical_parent
976  && nd->sec_node_index_ < nd->sec->nnode-1) {
977  if (rthost[i] == nrnmpi_myid) {
978  Area2RT& art = area2rt_[narea2rt_];
979  art.ms = vec2ms[i];
980  art.inode = inode[i+j];
981  art.n = 2;
982  art.pd[0] = &D(art.inode);
983  art.pd[1] = &RHS(art.inode);
984  if (bb_relation[i] > 2) {
985  art.n = 3;
986  k = vec2ms[i]->back_index;
987  MultiSplitThread& t = mth_[vec2ms[i]->ithread];
988  if (j == 0) {
989  art.pd[2] = t.sid1A + t.backAindex_[k];
990  }else{
991  art.pd[2] = t.sid1B + t.backBindex_[k];
992  }
993  }
994  ++narea2rt_;
995  }else{
996  Area2Buf& ab = area2buf_[narea2buf_];
997  ab.ms = vec2ms[i];
998  ab.inode = inode[i+j];
999  // can figure out how many but not
1000  // the indices into the send buffer.
1001  ab.n = 2;
1002  if (bb_relation[i] > 2) {
1003  ab.n = 3;
1004  }
1005  ++narea2buf_;
1006  }
1007  }
1008  if (bb_relation[i] == 2) { // sid[0] only
1009  break;
1010  }
1011  }
1012  }
1013 #if 0
1014 printf("%d narea2rt=%d narea2buf=%d\n", nrnmpi_myid, narea2rt_, narea2buf_);
1015 for (i = 0; i < narea2rt_; ++i) {
1016  Area2RT& art = area2rt_[i];
1017  printf("%d area2rt[%d] inode=%d n=%d thread=%d\n", nrnmpi_myid, i, art.inode, art.n, art.ms->ithread);
1018 }
1019 for (i = 0; i < narea2buf_; ++i) {
1020  Area2Buf& ab = area2buf_[i];
1021  printf("%d area2buf[%d] inode=%d n=%d\n", nrnmpi_myid, i, ab.inode, ab.n);
1022 }
1023 #endif
1024 #if 0
1025 if (nrnmpi_myid == 0) {
1026 for (i=0; i < n; ++i) {
1027  printf("%d %d sid=%d bbrelation=%d rthost=%d rt=%p\n", nrnmpi_myid, i, sid[i], bb_relation[i], rthost[i], rt[i]);
1028 }
1029 for (i=0; i < nrnmpi_numprocs+1; ++i) {
1030  printf("%d displ[%d]=%d\n", nrnmpi_myid, i, displ[i]);
1031 }
1032 for (i=0; i < nt; ++i) {
1033  printf("%d %d allsid=%d mark=%d all_bb_relation=%d\n", nrnmpi_myid, i, allsid[i], mark[i], all_bb_relation[i]);
1034 }}
1035 #endif
1036  // it is best if the user arranges things so that machine -> machine
1037  // has only one type of connection, i.e long->short, long->long,
1038  // or short->long since that means only one message required.
1039  // But for generality we program for the possibility of needing
1040  // two message types (long->short and short->long are the same
1041  // type and distinct from long->long)
1042  // Actually we do need 3 because one cannot mix long->short
1043  // and long->long in the same message because they need
1044  // different tags.
1045  // we also calculate the indices for the types
1046  // we need to have separate tags for long <-> reduced trees
1047  nthost_ = 0;
1048 // int ndbsize, tbsize;
1049  ndbsize = 0;
1050  tbsize = 0;
1051 
1052  for (i=0; i < nrnmpi_numprocs; ++i) {
1053  int nh = 0; int type = 0; int nh1 = 0;
1054  for (j=displ[i]; j < displ[i+1]; ++j) {
1055  if (mark[j] >= 0) {
1056  // how many distinct all_bb_relation<2: 1,2,3?
1057  if (bb_relation[mark[j]] == 1) { // short<->long
1058  assert(all_bb_relation[j] == 0);
1059  type |= 04;
1060  ++ndbsize; // space for node index
1061  tbsize += 2; // d and rhs
1062  }else if (all_bb_relation[j] == 1) { //long<->short
1063  type |= 01;
1064  ++ndbsize;
1065  tbsize += 2;
1066  }else if (all_bb_relation[j] < 2) { // long<->long
1067  type |= 02;
1068  ++ndbsize;
1069  tbsize += 2;
1070  }else if (all_bb_relation[j] >= 2) {
1071  int rth = rthost[mark[j]];
1072  if (rth == nrnmpi_myid) {
1073  if (i != nrnmpi_myid) {
1074  //this is the rthost
1075  //reducedtree->long
1076  type |= 010;
1077  // no node buffer since all
1078  // info <-> ReducedTree
1079  // ++ndbsize;
1080  tbsize += 2;
1081  if (all_bb_relation[j] > 2) {
1082  tbsize += 1;
1083  }
1084  }
1085  }else if (i == nrnmpi_myid) {
1086  // only this to the rthost
1087  // long->reducedtree
1088  nh1 += 1;
1089  // may not be distinct hosts
1090  // be careful not to count twice
1091  for (int jj = displ[i]; jj < j; ++jj) {
1092  if (rth == rthost[mark[jj]]) {
1093  nh1 -= 1;
1094  break;
1095  }
1096  }
1097  ++ndbsize;
1098  tbsize += 2; // off diag element
1099  if (all_bb_relation[j] > 2) {
1100  tbsize += 1;
1101  }
1102  }
1103  }
1104  }
1105  }
1106  nh = (type&1) + ((type/2)&1) + ((type/4)&1) + ((type/010)&1);
1107  nthost_ += nh + nh1;
1108 //printf("%d type=%d %d %d %d %d %d nh=%d nthost_=%d\n", nrnmpi_myid, type, type&1, (type/2)&1, (type/4)&1, (type/010)&1, (type/020)&1, nh, nthost_);
1109  }
1110 //printf("%d x nthost_=%d\n", nrnmpi_myid, nthost_);
1111  msti_ = new MultiSplitTransferInfo[nthost_];
1112  for (i=0; i < nthost_; ++i) { // two of these needed before rest of fill
1113  msti_[i].nnode_rt_ = 0;
1114  msti_[i].nd_rt_index_ = 0;
1115  msti_[i].nd_rt_index_th_ = 0;
1116  msti_[i].offdiag_ = 0;
1117  msti_[i].ioffdiag_ = 0;
1118  msti_[i].rthost_ = -1;
1119  }
1120  if (ndbsize) { // can be 0 if rthost
1121  nodeindex_buffer_ = new int[ndbsize];
1122  nodeindex_buffer_th_ = new int[ndbsize];
1123  nodeindex_rthost_ = new int[ndbsize];
1124  }
1125  for (i=0; i < ndbsize; ++i) {
1126  nodeindex_rthost_[i] = -1;
1127  }
1128 //printf("%d nthost_=%d ndbsize=%d tbsize=%d nrtree_=%d\n", nrnmpi_myid, nthost_, ndbsize, tbsize, nrtree_);
1129 //tbsize_=tbsize;
1130  if (tbsize) {
1131  trecvbuf_ = new double[tbsize];
1132  tsendbuf_ = new double[tbsize];
1133  }
1134  // We need to fill the msti_ array
1135  // in the order long<->short long<->reduced
1136  // followed by long<->long followed by reduced<->long
1137  // followed short<->long. // three distinct message tags
1138  nthost_ = 0;
1139  int mdisp = 0;
1140  k = 0;
1141 
1142  // first pass for the long<->short backbones
1143  for (i=0; i < nrnmpi_numprocs; ++i) {
1144  int b = 0;
1145  for (j=displ[i]; j < displ[i+1]; ++j) {
1146  if (mark[j] >= 0 && bb_relation[mark[j]] == 0 && all_bb_relation[j] == 1) {
1147  nodeindex_buffer_th_[k] = threadid[mark[j]];
1148  nodeindex_buffer_[k++] = inode[mark[j]];
1149 //printf("%d i=%d j=%d k=%d nthost=%d mark=%d inode=%d\n", nrnmpi_myid, i, j, k, nthost_, mark[j], inode[mark[j]]);
1150  ++b;
1151  }
1152  }
1153  if (b) {
1154  msti_[nthost_].displ_ = mdisp;
1155  msti_[nthost_].size_ = 2*b;
1156  mdisp += 2*b;
1157  msti_[nthost_].host_ = i;
1158  msti_[nthost_].nnode_ = b;
1159  msti_[nthost_].tag_ = 1;
1160  ++nthost_;
1161  }
1162  }
1163 
1164  // second pass for the long<->reducedtree
1165  // includes the communication of sid <-> reduced tree for the
1166  // non rthost cpus
1167  // first make a list of the rthost values, the size of that
1168  // list will be how many msti we need for this pass.
1169  int* tmphost = new int[n]; // cannot be more than this.
1170  int ntmphost = 0;
1171  i = nrnmpi_myid;
1172  for (j=displ[i]; j < displ[i+1]; ++j) {
1173  int j1 = mark[j];
1174  if (j1 >= 0 && bb_relation[j1] >= 2 && rthost[j1] != i) {
1175  tmphost[ntmphost++] = rthost[j1];
1176  for (int itmp = 1; itmp < ntmphost; ++itmp) {
1177  if (tmphost[itmp-1] == rthost[j1]) {
1178  ntmphost--;
1179  break;
1180  }
1181  }
1182  }
1183  }
1184 //printf("%d second pass ntmphost = %d\n", nrnmpi_myid, ntmphost);
1185  // now we can loop over that list and fill the msti
1186  for (int itmp = 0; itmp < ntmphost; ++itmp) {
1187  i = nrnmpi_myid;
1188  int b = 0;
1189  int br = 0;
1190  // only the ones on this machine and rthost not this machine
1191  // Note: rthost generally has different sid value
1192  // and number of sid than this machine
1193  for (j=displ[i]; j < displ[i+1]; ++j) {
1194  int jj = j - displ[i];
1195  int j1 = mark[j]; // point to sid[0] on this machine
1196  // skip if sid[1] of the rthost
1197 //printf("%d i=%d j=%d j1=%d bb_relation[j1]=%d rthost[j1]=%d\n",
1198 //nrnmpi_myid, i, j, j1, bb_relation[j1], rthost[j1]);
1199  if (j1 >= 0 && bb_relation[j1] >= 2
1200  && rthost[j1] == tmphost[itmp]) { // only the first has rthost
1201 //printf("%d sid=%d bb_relation=%d send to rthost=%d k=%d\n", nrnmpi_myid,
1202 //sid[j1], bb_relation[j1], rthost[j1], k);
1203  // mark[j] points to the index for sid0
1204  nodeindex_rthost_[k] = rthost[j1];
1205  nodeindex_buffer_th_[k] = threadid[j - displ[i]];
1206  nodeindex_buffer_[k++] = inode[j - displ[i]];
1207  ++b;
1208  // one or two?
1209  if (bb_relation[jj] > 2) {
1210 //int iii = i;
1211 {
1212 // except that msti_[nthost_].offdiag_ has not been allocated, this is a
1213 // good time to figure out the pointers to sid1A and sid1B and which is which
1214  MultiSplitTransferInfo& m = msti_[nthost_];
1215  int i;
1216  int* ix;
1217  int* ixth;
1218  double** od;
1219  int* iod;
1220  // start by assuming 2 and then increment by 2
1221  if (m.offdiag_) {
1222  ix = m.nd_rt_index_;
1223  ixth = m.nd_rt_index_th_;
1224  od = m.offdiag_;
1225  iod = m.ioffdiag_;
1226  m.nd_rt_index_ = new int[m.nnode_rt_ + 2];
1227  m.nd_rt_index_th_ = new int[m.nnode_rt_ + 2];
1228  m.offdiag_ = new double*[m.nnode_rt_ + 2];
1229  m.ioffdiag_ = new int[m.nnode_rt_ + 2];
1230  for (i=0; i < m.nnode_rt_; ++i) {
1231  m.nd_rt_index_[i] = ix[i];
1232  m.nd_rt_index_th_[i] = ixth[i];
1233  m.offdiag_[i] = od[i];
1234  m.ioffdiag_[i] = iod[i];
1235  }
1236  delete [] ix;
1237  delete [] ixth;
1238  delete [] od;
1239  delete [] iod;
1240  ix = m.nd_rt_index_ + m.nnode_rt_;
1241  ixth = m.nd_rt_index_th_;
1242  od = m.offdiag_ + m.nnode_rt_;
1243  iod = m.ioffdiag_ + m.nnode_rt_;
1244  }else{
1245  m.nd_rt_index_ = new int[2];
1246  m.nd_rt_index_th_ = new int[2];
1247  m.offdiag_ = new double*[2];
1248  m.ioffdiag_ = new int[2];
1249  ix = m.nd_rt_index_;
1250  ixth = m.nd_rt_index_th_;
1251  od = m.offdiag_;
1252  iod = m.ioffdiag_;
1253  }
1254  // now fill in
1255  // sid0 is for sid1A[0] , sid1 is for sid1B[???]
1256  //sid0: nodeindex = inode[j1]
1257  // in multisplit_v_setup, we created two parallel vectors of size
1258  // nbackrt_ (number of backbones sending to reduced tree).
1259  // backsid_ is sid0, and
1260  // backAindex_ corresponding sid1A index for sid0
1261  // backBindex_ sid1B index for sid1
1262  // Should not be too many backbones on this machine so linear
1263  // search should be ok. BUG again. i is MultiSplit.back_index and
1264  // can determine MultiSplit from j1.
1265  i = vec2ms[jj]->back_index;
1266  MultiSplitThread& t = mth_[vec2ms[jj]->ithread];
1267  assert(sid[jj] == t.backsid_[i]);
1268  ix[0] = inode[jj];
1269  ix[1] = inode[jj + 1];
1270  ixth[0] = vec2ms[jj]->ithread;
1271  ixth[1] = vec2ms[jj]->ithread;
1272  od[0] = t.sid1A + t.backAindex_[i];
1273  od[1] = t.sid1B + t.backBindex_[i];
1274  iod[0] = t.backAindex_[i];
1275  iod[1] = t.backBindex_[i];
1276 #if 0
1277 printf("%d offdiag nbrt=%d iii=%d i=%d j1=%d jj=%d sid=%d ix = %d %d back = %d %d\n",
1278 nrnmpi_myid, nbackrt_, iii, i, j1, jj, sid[j1], ix[0], ix[1], t.backAindex_[i], t.backBindex_[i]);
1279 #endif
1280  m.nnode_rt_ += 2;
1281 }
1282  nodeindex_rthost_[k] = rthost[j1];
1283  ++j; ++jj;
1284  nodeindex_buffer_th_[k] = threadid[jj];
1285  nodeindex_buffer_[k++] = inode[jj];
1286  ++b;
1287  br += 2;
1288  }
1289  }
1290  }
1291  if (b || br) {
1292  msti_[nthost_].displ_ = mdisp;
1293  msti_[nthost_].size_ = 2*b + br;
1294  mdisp += 2*b + br;
1295  msti_[nthost_].host_ = tmphost[itmp];
1296  msti_[nthost_].rthost_ = tmphost[itmp];
1297  msti_[nthost_].nnode_ = b;
1298  msti_[nthost_].tag_ = 3;
1299  ++nthost_;
1300  }
1301  }
1302  delete [] tmphost;
1303 
1304  // third pass for the long->long backbones
1305  for (i=0; i < nrnmpi_numprocs; ++i) {
1306  int b = 0;
1307  for (j=displ[i]; j < displ[i+1]; ++j) {
1308  if (mark[j] >= 0 && bb_relation[mark[j]] == 0 && all_bb_relation[j] == 0) {
1309  nodeindex_buffer_th_[k] = threadid[mark[j]];
1310  nodeindex_buffer_[k++] = inode[mark[j]];
1311 //printf("%d i=%d j=%d k=%d nthost=%d mark=%d inode=%d\n", nrnmpi_myid, i, j, k, nthost_, mark[j], inode[mark[j]]);
1312  ++b;
1313  }
1314  }
1315  if (b) {
1316  msti_[nthost_].displ_ = mdisp;
1317  msti_[nthost_].size_ = 2*b;
1318  mdisp += 2*b;
1319  msti_[nthost_].host_ = i;
1320  msti_[nthost_].nnode_ = b;
1321  msti_[nthost_].tag_ = 2;
1322  ++nthost_;
1323  }
1324  }
1325 
1326  // fourth pass for the reducedtree->long backbones
1327  // the reduced tree <-> sids not on this host
1328  ihost_reduced_long_ = nthost_;
1329  for (i=0; i < nrnmpi_numprocs; ++i) {
1330  int b = 0;
1331  int br = 0;
1332  // not the ones on this machine and rthost this machine
1333  for (j=displ[i]; j < displ[i+1]; ++j) {
1334  int j1 = mark[j];
1335  if (j1 >= 0 && all_bb_relation[j] >= 2
1336  && rthost[j1] == nrnmpi_myid
1337  && rthost[j1] != i) {
1338 //printf("%d i=%d sid=%d bb_relation=%d send to rthost=%d rt=%p\n", nrnmpi_myid, i,
1339 //allsid[j], all_bb_relation[j], rthost[j1], rt[j1]);
1340  // don't care about the nodeindex, only
1341  // the ReducedTree.
1342 // fill in the ReducedTree map with RHS and D pointers to the sid[0] related receive buffer
1343 // and next iteration the sid[1]
1344 int ib = mdisp + 2*b;
1345 //printf("%d fill buf %d\n", nrnmpi_myid, ib);
1346 rt[j1]->fillrmap(allsid[j], -1, trecvbuf_ + ib+1); //exchange order is d, rhs
1347 rt[j1]->fillrmap(allsid[j], allsid[j], trecvbuf_ + ib);
1348 rt[j1]->fillsmap(allsid[j], tsendbuf_ + ib+1, tsendbuf_ + ib);
1349  ++b; // at least D and RHS received and sent back
1350  if (all_bb_relation[j] > 2) { // two of these
1351 // need to fill in off diag after this iteration is done and then re-iterate
1352 // because they are at the end of the buffer section for machine i.
1353  ++br; // an offdiag received
1354  // though I suppose it does not
1355  // need to be sent back
1356 
1357  }
1358  }
1359  }
1360 // reiterate and fill in off diag
1361  br = 0;
1362  for (j=displ[i]; j < displ[i+1]; ++j) {
1363  int j1 = mark[j];
1364  if (j1 >= 0 && all_bb_relation[j] >= 2
1365  && rthost[j1] == nrnmpi_myid
1366  && rthost[j1] != i) {
1367  if (all_bb_relation[j] > 2) { // two of these
1368  int ib = mdisp + 2*b + br;
1369 //printf("%d offdiag fill buf %d j1=%d rt=%p\n", nrnmpi_myid, ib, j1, rt[j1]);
1370 rt[j1]->fillrmap(allsid[j+1], allsid[j], trecvbuf_ + ib);
1371 rt[j1]->fillrmap(allsid[j], allsid[j+1], trecvbuf_ + ib + 1);
1372  br += 2;
1373  j += 1;
1374  }
1375  }
1376  }
1377 
1378  if (b || br) {
1379  msti_[nthost_].displ_ = mdisp;
1380  msti_[nthost_].size_ = 2*b + br;
1381  mdisp += 2*b + br;
1382  msti_[nthost_].host_ = i;
1383  msti_[nthost_].rthost_ = nrnmpi_myid;
1384  msti_[nthost_].nnode_ = b;
1385  // no nnode_rt_ for this phase
1386  //msti_[nthost_].nnode_rt_ = br;
1387  //msti_[nthost_].offdiag_ = new double*[br];
1388  msti_[nthost_].tag_ = 3;
1389  ++nthost_;
1390  }
1391  }
1392 
1393  // fifth pass for the short->long backbones
1394  int tmp_index = k;
1395  ihost_short_long_ = nthost_;
1396  for (i=0; i < nrnmpi_numprocs; ++i) {
1397  int b = 0;
1398  for (j=displ[i]; j < displ[i+1]; ++j) {
1399  if (mark[j] >= 0 && bb_relation[mark[j]] == 1 && all_bb_relation[j] == 0) {
1400  nodeindex_buffer_th_[k] = threadid[mark[j]];
1401  nodeindex_buffer_[k++] = inode[mark[j]];
1402 //printf("%d i=%d j=%d k=%d nthost=%d mark=%d inode=%d\n", nrnmpi_myid, i, j, k, nthost_, mark[j], inode[mark[j]]);
1403  ++b;
1404  }
1405  }
1406  if (b) {
1407  msti_[nthost_].displ_ = mdisp;
1408  msti_[nthost_].size_ = 2*b;
1409  mdisp += 2*b;
1410  msti_[nthost_].host_ = i;
1411  msti_[nthost_].nnode_ = b;
1412  msti_[nthost_].tag_ = 1;
1413  ++nthost_;
1414  }
1415  }
1416 
1417 #if 0
1418 printf("%d nthost_ = %d\n", nrnmpi_myid, nthost_);
1419 for (i=0; i < nthost_; ++i) {
1420 MultiSplitTransferInfo& m = msti_[i];
1421 printf("%d i=%d displ_=%d host_=%d nnode_=%d tag_=%d\n", nrnmpi_myid, i, m.displ_,
1422 m.host_, m.nnode_, m.tag_);
1423 }
1424 #endif
1425 
1426 #if 0
1427 for (i=0; i < k; ++i) {
1428 j=nodeindex_buffer_[i];
1429 printf("%d %d nodeindex_buffer %d %s %d\n", nrnmpi_myid, i, j,
1430 secname(v_node[j]->sec), v_node[j]->sec_node_index_);
1431 }
1432 #endif
1433 
1434  // the sids on this machine with rthost on this machine
1435  // go into the ReducedTree directly
1436  if (nrtree_) {
1437  i = 0;
1438  int ib = 0;
1439  MultiSplit* ms;
1440  for (int ii=0; ii < multisplit_list_->count(); ++ii) {
1441  ms = multisplit_list_->item(ii);
1442  NrnThread* _nt = nrn_threads + ms->ithread;
1443  MultiSplitThread& t = mth_[ms->ithread];
1444  if (ms->rthost == nrnmpi_myid) {
1445 //printf("%d nrtree_=%d i=%d rt=%p\n", nrnmpi_myid, nrtree_, i, rt[i]);
1446  int j = ms->nd[0]->v_node_index;
1447 //printf("%d call fillrmap sid %d,%d %d node=%d\n", nrnmpi_myid, ms->sid[0], ms->sid[0], j);
1448  ms->rt_ = rt[i];
1449  ms->rmap_index_ = rt[i]->irfill;
1450  ms->smap_index_ = rt[i]->nsmap;
1451  rt[i]->fillrmap(ms->sid[0], -1, &RHS(j));
1452  rt[i]->fillrmap(ms->sid[0], ms->sid[0], &D(j));
1453  rt[i]->fillsmap(ms->sid[0], &RHS(j), &D(j));
1454  if (ms->nd[1]) {
1455  ib = ms->back_index;
1456  assert(t.backsid_[ib] == ms->sid[0]);
1457  // fill sid1 row
1458 // note: for cvode need to keep fillrmap as pairs so following
1459 // moved from between sid1A,sid1B fillrmap.
1460  j = ms->nd[1]->v_node_index;
1461 //printf("%d call fillrmap sid1 %d %d node=%d\n", nrnmpi_myid, ms->sid[1], ms->sid[1], j);
1462  rt[i]->fillrmap(ms->sid[1], -1, &RHS(j));
1463  rt[i]->fillrmap(ms->sid[1], ms->sid[1], &D(j));
1464  rt[i]->fillsmap(ms->sid[1], &RHS(j), &D(j));
1465 
1466  // fill sid1A for sid0 row
1467 //printf("%d call fillrmap sid %d,%d %d ib=%d t.backsid=%d t.backAindex=%d\n",
1468 //nrnmpi_myid, ms->sid[1], ms->sid[0], ib, t.backsid_[ib], t.backAindex_[ib]);
1469  rt[i]->fillrmap(ms->sid[1], ms->sid[0], t.sid1A + t.backAindex_[ib]);
1470 
1471  // fill sid1B
1472 //printf("%d call fillrmap sid %d,%d ib=%d backBindex=%d\n", nrnmpi_myid, ms->sid[0], ms->sid[1], ib, backBindex_[ib]);
1473  rt[i]->fillrmap(ms->sid[0], ms->sid[1], t.sid1B + t.backBindex_[ib]);
1474  }
1475  }
1476  ++i; if (ms->nd[1]) { ++i; }
1477  }
1478  }
1479 
1480  if (nthost_) {
1481  msti_[0].nodeindex_th_ = nodeindex_buffer_th_;
1482  msti_[0].nodeindex_ = nodeindex_buffer_;
1483  for (i=1; i < nthost_; ++i) {
1484  msti_[i].nodeindex_th_ = msti_[i-1].nodeindex_th_
1485  + msti_[i-1].nnode_;
1486  msti_[i].nodeindex_ = msti_[i-1].nodeindex_
1487  + msti_[i-1].nnode_;
1488  }
1489  }
1490  // count how many nodeindex_buffer are non-zero area nodes
1491  // iarea_short_long_ and narea_ only for ones
1492  // not related to the ReducedTree method
1493 //printf("%d ndbsize=%d\n", nrnmpi_myid, ndbsize);
1494  for (i=0; i < ndbsize; ++i) {
1495  NrnThread* _nt = nrn_threads + nodeindex_buffer_th_[i];
1496  Node* nd = _nt->_v_node[nodeindex_buffer_[i]];
1497  if (i == tmp_index) {
1498  iarea_short_long_ = narea_;
1499  }
1500  if (nd->_classical_parent
1501  && nd->sec_node_index_ < nd->sec->nnode - 1) {
1502  if (nodeindex_rthost_[i] < 0) {
1503  ++narea_;
1504  }
1505  }
1506  }
1507 //printf("%d narea=%d iarea_short_long=%d\n", nrnmpi_myid, narea_, iarea_short_long_);
1508  if (narea_) {
1509  buf_area_indices_ = new int[narea_];
1510  area_node_indices_ = new int[narea_];
1511  k = 0;
1512  for (i=0; i < ndbsize; ++i) {
1513  NrnThread* _nt = nrn_threads + nodeindex_buffer_th_[i];
1514  Node* nd = _nt->_v_node[nodeindex_buffer_[i]];
1515  if (nd->_classical_parent
1516  && nd->sec_node_index_ < nd->sec->nnode - 1) {
1517  buf_area_indices_[k] = 2*i;
1518  area_node_indices_[k] = nodeindex_buffer_[i];
1519  ++k;
1520  }
1521  }
1522  }
1523 
1524  // look through the nthost list and fill in the area2buf receive buffer
1525  // indices for info sent to reducetrees (by definition,not on this machine)
1526  for (i=0; i < nthost_; ++i) {
1527  MultiSplitTransferInfo& msti = msti_[i];
1528  if (msti.tag_ == 3) { // reduced tree related
1529  // but we only want to, not from, the reduced tree machine
1530  // any nodes nonzero area?
1531 //printf("%d i=%d nnode=%d nodeindex=%p host=%d rthost=%d\n", nrnmpi_myid, i, msti.nnode_, msti.nodeindex_, msti.host_, msti.rthost_);
1532  if (msti.rthost_ != nrnmpi_myid) for (j=0; j < msti.nnode_; ++j) {
1533  NrnThread* _nt = nrn_threads + msti.nodeindex_th_[j];
1534  int in = msti.nodeindex_[j];
1535  Node* nd = _nt->_v_node[in];
1536  if (nd->_classical_parent
1537  && nd->sec_node_index_ < nd->sec->nnode - 1) {
1538  // non-zero area
1539  // search the area2buf list
1540  for (k = 0; k < narea2buf_; ++k) {
1541  if (area2buf_[k].inode == in
1542  && area2buf_[k].ms->ithread == _nt->id ) {
1543  break;
1544  }
1545  }
1546  assert(k < narea2buf_);
1547  Area2Buf& ab = area2buf_[k];
1548  // ok. fill in the ibuf info
1549  ab.ibuf[0] = msti.displ_ + 2*j;
1550  ab.ibuf[1] = ab.ibuf[0] + 1;
1551  if (ab.n == 3) {
1552  // which offdiag item?
1553  int ioff;
1554 for (ioff = 0; ioff < msti.nnode_rt_; ++ioff) {
1555  if (msti.nd_rt_index_[ioff] == in && msti.nd_rt_index_th_[ioff] == _nt->id) {
1556  break;
1557  }
1558 }
1559  assert(ioff < msti.nnode_rt_);
1560  ab.ibuf[2] = msti.displ_
1561  + 2*msti.nnode_ + ioff;
1562  }
1563 #if 0
1564 printf("%d in=%d n=%d ibuf %d %d %d\n", nrnmpi_myid, in, ab.n,
1565 ab.ibuf[0], ab.ibuf[1], (ab.n == 3) ? ab.ibuf[2] : -1);
1566 #endif
1567  }
1568  }
1569  }
1570  }
1571 
1572 //printf("%d leave exchange_setup\n", nrnmpi_myid);
1573 //nrnmpi_int_allgather(&n, nn, 1);
1574 
1575  delete [] nn;
1576  delete [] sid;
1577  delete [] inode;
1578  delete [] threadid;
1579  delete [] displ;
1580  delete [] allsid;
1581  delete [] vec2ms;
1582  delete [] mark;
1583  delete [] bb_relation;
1584  delete [] all_bb_relation;
1585  delete [] connects2short;
1586  delete [] rthost;
1587  delete [] rt;
1588 
1589  errno = 0;
1590 }
1591 
1592 // when d and rhs pointers are updated in the Node, they must be updated in
1593 // the reducedtree rmap and smap as well
1595  msc_->rt_map_update();
1596 }
1597 
1599  for (int i=0; i < multisplit_list_->count(); ++i) {
1600  MultiSplit& ms = *multisplit_list_->item(i);
1601  if (ms.rthost == nrnmpi_myid) { // reduced tree on this host
1602  assert(ms.rt_);
1603  assert(ms.rmap_index_ >= 0);
1604  assert(ms.smap_index_ >= 0);
1605  MultiSplitThread& t = mth_[ms.ithread];
1606  double** r = ms.rt_->rmap + ms.rmap_index_;
1607  double** s = ms.rt_->smap + ms.smap_index_;
1608  for (int j=0; j < 2; ++j) if (ms.nd[j]) {
1609  *s++ = *r++ = &NODERHS(ms.nd[j]);
1610  *s++ = *r++ = &NODED(ms.nd[j]);
1611  }
1612  if (ms.nd[1]) { // do sid1a and sid1b as well
1613  assert(ms.back_index >= 0);
1614  *r++ = t.sid1A + t.backAindex_[ms.back_index];
1615  *r++ = t.sid1B + t.backBindex_[ms.back_index];
1616  }
1617  }
1618  }
1619  // also need to do the Area2RT.pd[3] where the
1620  // order is d, rhs, and possibly sid1A or sid1B
1621  for (int i = 0; i < narea2rt_; ++i) {
1622  Area2RT& art = area2rt_[i];
1623  MultiSplit& ms = *art.ms;
1624  NrnThread* _nt = nrn_threads + ms.ithread;
1625  art.pd[0] = &D(art.inode);
1626  art.pd[1] = &RHS(art.inode);
1627  if (art.n == 3) {
1628  MultiSplitThread& t = mth_[ms.ithread];
1629  if (art.inode == ms.nd[0]->v_node_index) {
1630  art.pd[2] = t.sid1A + t.backAindex_[ms.back_index];
1631  }else if (art.inode == ms.nd[1]->v_node_index) {
1632  art.pd[2] = t.sid1B + t.backBindex_[ms.back_index];
1633  }else{
1634  assert(0);
1635  }
1636  }
1637  }
1638 }
1639 
1641  int i, j, k, id;
1642  id = nrnmpi_myid;
1643  // assume only one thread when there is transfer info
1644  NrnThread* nt = nrn_threads;
1645  Printf("%d nthost_=%d\n", id, nthost_);
1646  for (i=0; i < nthost_; ++i) {
1647  MultiSplitTransferInfo& ms = msti_[i];
1648  Printf("%d %d host=%d nnode=%d displ=%d\n", id, i, ms.host_, ms.nnode_, ms.displ_);
1649  for (j=0; j < ms.nnode_; ++j) {
1650  k = ms.nodeindex_[j];
1651  Printf("%d %d %d %d %s %d\n",
1652 id, i, j, k, secname(nt->_v_node[k]->sec), nt->_v_node[k]->sec_node_index_);
1653  }
1654  }
1655 }
1656 
1657 void MultiSplitControl::reduced_mark(int m, int sid, int nt, int* mark, int* allsid,
1658  int* all_bb_relation) {
1659  int i;
1660  for (i = 0; i < nt; ++i) {
1661  if (mark[i] == -1 && allsid[i] == sid) {
1662  mark[i] = m;
1663  if (all_bb_relation[i] > 2) {
1664  int sid2 = all_bb_relation[i] - 3;
1665 reduced_mark(m, sid2, nt, mark, allsid, all_bb_relation);
1666  }
1667  }
1668  }
1669 }
1670 
1671 /*
1672 section ParallelContext.multisplit_connect(x, sid) is a generalization of
1673 root ParallelContext.splitcell_connect(adjacent_host).
1674 sid refers to a point and all points with the same sid are connected
1675 together with 0 resistance wires.
1676 If a connected set of sections has only 1 sid then that point will be the
1677 root and triangularization can proceed normally.
1678 If a connected set of sections has 2 sid then triangularization proceeds
1679 from the leaves to the backbone between the 2 sid which forms a cable.
1680 Then special triangularization with fill-in of the sid1A column
1681 proceeds from the backbone element adjacent to sid1 and then with fill-in
1682 of the sid1B column (which modifies the sid1A column) from the backbone end
1683 adjacent to sid0
1684 in the hope that the fill-in elements at the sid are small enough
1685 to ignore in the interprocessor jacobian element matrix exchange. Note
1686 that gaussian elimination in the backbone is more complex as normal
1687 tree elimination with the same number of nodes.
1688 We defer the implementation of more than 2 sid in a subtree.
1689 */
1690 
1691 /* Gaussian elimination strategy for the backbone.
1692 There are two different strategies that optimize either minimum
1693 number of matrix operations that is best for long backbones, or
1694 numerical stability by minimizing the far off diagonal just before
1695 exchange which would be best for short (tightly coupled) backbones.
1696 
1697 For long backbones, it is simplest to begin triangularization
1698 using the equations adjacent to the center as pivot equations.
1699 This fills in a single column. But in the trivial case of the center
1700 being the only interior point, there are no triangularization operations
1701 before exchange.
1702 1 xx
1703 2 xxx
1704 3 xxx
1705 4 xxx
1706 5 xxx
1707 6 xxx
1708 7 xx
1709 
1710 
1711 1 x x
1712 2 xx x
1713 3 xxx
1714 4 xxx
1715 5 xxx
1716 6 x xx
1717 7 x x
1718 
1719 exchange occurs
1720 
1721 1 x x
1722 2 x x
1723 3 xx
1724 4 x
1725 5 xx
1726 6 x x
1727 7 x x
1728 
1729 For short backbones and greatest stability/accuracy and still simple but
1730 many more operations, one begins adjacent to the sids. Now,
1731 in the trivial case of the center being the only interior point, it is
1732 used to reduce the off diagonal element. If there is no interior point
1733 nothing happens.
1734 
1735 1 xx
1736 2 xxx triangularize starting with equation 6 (cannot start with
1737 3 xxx equation 1 or 7 because we do not know the correct value for d)
1738 4 xxx this eliminates the a elements but at the cost of introducing
1739 5 xxx A elements in column 7
1740 6 xxx
1741 7 xx
1742 
1743 1 x x cannot use 1, instead do the same starting at 2
1744 2 xx x this eliminates b elements but at the cost of introducing
1745 3 xx x B elements in column 1. Also A elements are modified.
1746 4 xx x
1747 5 xx x
1748 6 xxx
1749 7 xx
1750 
1751 1 x x
1752 2 xx x
1753 3 x x x
1754 4 x x x
1755 5 x x x
1756 6 x xx
1757 7 x x
1758 
1759 exchange occurs (modified d for 1 and 7
1760 solve the 1 and 7 2x2 equations then back substitute
1761 
1762 */
1763 
1765  int id, i, it;
1766  for (id=0; id < nrnmpi_numprocs; ++id) {
1767  nrnmpi_barrier();
1768  if (id == nrnmpi_myid) {
1769  Printf("myid=%d\n", id);
1770  Printf(" MultiSplit %ld\n", multisplit_list_->count());
1771  for (i=0; i < multisplit_list_->count(); ++i) {
1772  MultiSplit* ms = multisplit_list_->item(i);
1773  Node* nd = ms->nd[0];
1774  Printf(" %2d bbs=%d bi=%-2d rthost=%-4d %-4d %s{%d}", i, ms->backbone_style,
1775  ms->back_index, ms->rthost, ms->sid[0], secname(ms->nd[0]->sec),
1776  ms->nd[0]->sec_node_index_);
1777  if (ms->nd[1]) {
1778  Printf(" %-4d %s{%d}", ms->sid[1], secname(ms->nd[1]->sec),
1779  ms->nd[1]->sec_node_index_);
1780  }
1781  Printf("\n");
1782  }
1783  for (it = 0; it < nrn_nthread; ++it) {
1784  NrnThread* _nt = nrn_threads + it;
1785  MultiSplitThread& t = mth_[it];
1786  Printf(" backbone_begin=%d backbone_long_begin=%d backbone_interior_begin=%d\n", t.backbone_begin, t.backbone_long_begin, t.backbone_interior_begin);
1787  Printf(" backbone_sid1_begin=%d backbone_long_sid1_begin=%d backbone_end=%d\n", t.backbone_sid1_begin, t.backbone_long_sid1_begin, t.backbone_end);
1788  Printf(" nbackrt_=%d i, backsid_[i], backAindex_[i], backBindex_[i]\n", t.nbackrt_);
1789  if (t.nbackrt_) {
1790  for (int i=0; i < t.nbackrt_; ++i) {
1791  Printf(" %2d %2d %5d %5d", i, t.backsid_[i], t.backAindex_[i], t.backBindex_[i]);
1792  Node* nd = _nt->_v_node[t.backbone_begin + t.backAindex_[i]];
1793  Printf(" %s{%d}", secname(nd->sec), nd->sec_node_index_);
1794  nd = _nt->_v_node[t.backbone_begin + t.backBindex_[i]];
1795  Printf(" %s{%d}", secname(nd->sec), nd->sec_node_index_);
1796  Printf("\n");
1797  }
1798  }
1799  }
1800  Printf(" ReducedTree %d\n", nrtree_);
1801  for (i=0; i < nrtree_; ++i) {
1802  ReducedTree* rt = rtree_[i];
1803  Printf(" %d n=%d nmap=%d\n", i, rt->n, rt->nmap);
1804  rt->pr_map(tbsize, trecvbuf_);
1805  }
1806  Printf(" MultiSplitTransferInfo %d\n", nthost_);
1807  for (i=0; i < nthost_; ++i) {
1808  MultiSplitTransferInfo& m = msti_[i];
1809 Printf(" %d host=%d rthost=%d nnode=%d nnode_rt=%d size=%d tag=%d\n",
1810 i, m.host_, m.rthost_, m.nnode_, m.nnode_rt_, m.size_, m.tag_);
1811  if (m.nnode_) {
1812 Printf(" nodeindex=%p nodeindex_buffer = %p\n", m.nodeindex_,nodeindex_buffer_);
1813  }
1814  }
1815  Printf(" ndbsize=%d i nodeindex_buffer_=%p nodeindex_rthost_=%p\n",
1816  ndbsize, nodeindex_buffer_, nodeindex_rthost_);
1817  if (ndbsize) {
1818  for (int i=0; i < ndbsize; ++i) {
1819  Printf(" %d %d %d\n", i, nodeindex_buffer_[i], nodeindex_rthost_[i]);
1820  }
1821  }
1822  Printf(" tbsize=%d trecvbuf_=%p tsendbuf_=%p\n", tbsize,
1823  trecvbuf_, tsendbuf_);
1824  Printf("\n");
1825  }
1826  }
1827  nrnmpi_barrier();
1828 }
1829 
1830 // following uses the short backbone method (N form of the matrix)
1831 
1832 
1833 // What is a good order for the above? We already have a tree order structure
1834 // with a root. If there is a single sid involved then that should be the
1835 // root and we are conceptually the same as splitcell.cpp
1836 // For two sid, then the tree is ordered as (see above comment at the NOTE)
1837 
1839  msc_->solve();
1840 }
1841 
1843  msc_->mth_[nt->id].triang(nt);
1844  return nullptr;
1845 }
1847  if (nt->id == 0) {
1848  msc_->reduce_solve();
1849  }
1850  return nullptr;
1851 }
1853  msc_->mth_[nt->id].bksub(nt);
1854  return nullptr;
1855 }
1856 
1858 // if (t < .025) prstruct();
1859 // if (nrnmpi_myid == 0) pmat(true);
1860  NrnThread* nt = nrn_threads;
1861  MultiSplitThread& t = mth_[0];
1863  t.triang_backbone(nt);
1864 // if (nrnmpi_myid == 4) pmat(true);
1865 // pmat1("t");
1866 //printf("%d enter matrix exchange\n", nrnmpi_myid);
1867  matrix_exchange();
1868 //printf("%d leave matrix exchange\n", nrnmpi_myid);
1869 // pmat1("e");
1870  t.bksub_backbone(nt);
1871  t.bksub_subtrees(nt);
1872 // pmat(true);
1873 // nrnmpi_barrier(); // only possible if everyone is actually multisplit
1874 // abort();
1875 }
1876 
1878  triang_subtree2backbone(_nt);
1879  triang_backbone(_nt);
1880 }
1882  matrix_exchange();
1883 }
1885  bksub_backbone(_nt);
1886  bksub_subtrees(_nt);
1887 }
1888 
1889 // In the typical case, all nodes connected to the same sids have 0 area
1890 // and thus the weighted average voltage is just sum RHS / sum D.
1891 // However, nrn_multisplit_nocap_v is somewhat of a misnomer because one or
1892 // more compartments (but not connected together with the same sid) may have
1893 // non-zero area and thus that voltage is correct and must be transferred
1894 // to all the connecting (with same sid) zero-area nodes. Furthermore
1895 // the sum RHS (of the zero area nodes) must be added to the RHS of non-zero
1896 // area node (after that non-zero RHS has been computed back in the caller).
1897 // Lastly, that sum of RHS has to be adjusted using the sum of D of the
1898 // zero area nodes so that the RHS is proper with respect to the correct
1899 // value of V.
1900 // So. When there is no non-zero area node for a sid, the voltage is
1901 // sum rhs / sum d. For a non-zero area node the V must be sent and the
1902 // sum rhs and sum d from the others must be received and the true rhs
1903 // put into adjust_rhs_ using rhs - d*v. For a zero-area
1904 // node (with same sid as non-zero area node), rhs and d must be sent, and
1905 // voltage can be computed after receiving the changed rhs and d by
1906 // rhs/d.
1907 
1912 }
1914  msc_->multisplit_nocap_v_part1(nt);
1915 }
1917  msc_->multisplit_nocap_v_part2(nt);
1918 }
1920  msc_->multisplit_nocap_v_part3(nt);
1921 }
1922 
1924  int i;
1925  // scale the non-zero area node elements since those already
1926  // have the answer.
1927 
1928  // the problem here is that non-zero area v is correct,
1929  // but rhs ends up wrong for
1930  // non-zero area nodes (because current from zero area not added)
1931  // so encode v into D and sum of zero-area rhs will end up in
1932  // rhs.
1933  if (_nt->id == 0) for (i=0; i < narea2buf_; ++i) {
1934  Area2Buf& ab = area2buf_[i];
1935  VEC_D(ab.inode) = 1e50; // sentinal
1936  VEC_RHS(ab.inode) = VEC_V(ab.inode)*1e50;
1937  }
1938  // also scale the non-zero area elements on this host
1939  for (i=0; i < narea2rt_; ++i) {
1940  Area2RT& ar = area2rt_[i];
1941  if (_nt->id == ar.ms->ithread) {
1942  VEC_D(ar.inode) = 1e50;
1943  VEC_RHS(ar.inode) = VEC_V(ar.inode)*1e50;
1944  }
1945  }
1946 }
1948  if (_nt->id == 0) {matrix_exchange_nocap();}
1949 }
1951  // at this point, for zero area nodes, D is
1952  // 1.0, and the voltage is RHS/D).
1953  // So zero-area node information is fine.
1954  // But for non-zero area nodes, D is the sum of all zero-area
1955  // node d, and RHS is the sum of all zero-area node rhs.
1956  int i, j;
1957 
1958  if (_nt->id == 0) for (i=0; i < narea2buf_; ++i) {
1959  Area2Buf& ab = area2buf_[i];
1960  int j = ab.inode;
1961  double afac = 100. / VEC_AREA(j);
1962  ab.adjust_rhs_ = (VEC_RHS(j) - VEC_D(j)*VEC_V(j)) * afac;
1963 //printf("%d nz1 %d D=%g RHS=%g V=%g afac=%g adjust=%g\n",
1964 //nrnmpi_myid, i, D(i), RHS(i), VEC_V(j), afac, ab.adjust_rhs_);
1965  }
1966  for (i=0; i < narea2rt_; ++i) {
1967  Area2RT& ar = area2rt_[i];
1968  if (_nt->id == ar.ms->ithread) {
1969  int j = ar.inode;
1970  double afac = 100. / VEC_AREA(j);
1971  ar.adjust_rhs_ = (VEC_RHS(j) - VEC_D(j)*VEC_V(j)) * afac;
1972 //printf("%d nz2 %d D=%g RHS=%g V=%g afac=%g adjust=%g\n",
1973 //nrnmpi_myid, i, D(i), RHS(i), VEC_V(j), afac, ar.adjust_rhs_);
1974  }
1975  }
1976 }
1977 
1979  msc_->multisplit_adjust_rhs(nt);
1980 }
1981 
1983  int i;
1984  if (_nt->id == 0) for (i=0; i < narea2buf_; ++i) {
1985  Area2Buf& ab = area2buf_[i];
1986  VEC_RHS(ab.inode) += ab.adjust_rhs_;
1987  }
1988  // also scale the non-zero area elements on this host
1989  for (i=0; i < narea2rt_; ++i) {
1990  Area2RT& ar = area2rt_[i];
1991  if (_nt->id == ar.ms->ithread) {
1992 //printf("%d adjust %d %g %g\n",
1993 //nrnmpi_myid, ar.inode, ar.adjust_rhs_, VEC_RHS(ar.inode));
1994  VEC_RHS(ar.inode) += ar.adjust_rhs_;
1995  }
1996  }
1997 }
1998 
1999 // Two phases: Send all the info from long backbones and receive
2000 // all the short backbone info for this machine.
2001 // Then process the 2x2 short backbone and send remaining (short)
2002 // backbone and receive the rest (long) of the backbone info.
2003 
2004 // The ReducedTree solve fits into this strategy receiving from long at the
2005 // same time as receiving from short and
2006 // sending to long at the same time as sending from short
2007 
2008 // The MultiSplitTransferInfo order is
2009 // this -> other
2010 // long -> short 0, ihost_long_long-1
2011 // long -> long ihost_long_long, ihost_long_reduced-1
2012 // long -> reduced ihost_long_reduced, ihost_reduced_long-1
2013 // reduced -> long ihost_reduced_long, ihost_short_long-1
2014 // short -> long ihost_short_long, nthost_
2015 
2017  int i, j, jj, k;
2018  double* tbuf;
2019  double rttime;
2020  double wt = nrnmpi_wtime();
2021  NrnThread* _nt;
2022  // the mpi strategy is copied from the
2023  // cvode/examples_par/pvkxb.cpp exchange strategy
2024 
2025 #define EXCHANGE_ON 1
2026 #if EXCHANGE_ON
2027  // post all the receives
2028  for (i=0; i < nthost_; ++i) {
2029  int tag;
2030  MultiSplitTransferInfo& mt = msti_[i];
2031  tag = mt.tag_;
2032  // receiving the result from a reduced tree is tag 4.
2033  if (tag == 3 && nrnmpi_myid != mt.rthost_) { tag = 4; }
2034  nrnmpi_postrecv_doubles(trecvbuf_ + mt.displ_, mt.size_,
2035  mt.host_, tag, &mt.request_);
2036 #if 0
2037 printf("%d post receive %d displ=%d size=%d host=%d tag=%d\n",
2038 nrnmpi_myid, i, mt.displ_, mt.size_, mt.host_, tag);
2039 #endif
2040  }
2041 
2042  // Note that we are assuming only one thread (_nt) when
2043  // mpi is used. (In order for D and RHS to make sense)
2044  // fill the send buffer with the long backbone info
2045  // i.e. long->short, long -> long, long -> reduced
2046  for (i=0; i < ihost_reduced_long_; ++i) {
2047  MultiSplitTransferInfo& mt = msti_[i];
2048  j = 0;
2049  tbuf = tsendbuf_ + mt.displ_;
2050  for (jj = 0; jj < mt.nnode_; ++jj) {
2051  k = mt.nodeindex_[jj];
2052  _nt = nrn_threads + mt.nodeindex_th_[jj];
2053  tbuf[j++] = D(k);
2054  tbuf[j++] = RHS(k);
2055  }
2056  // each sent backbone will have added 2 to mt.nnode_rt_
2057  for (jj = 0; jj < mt.nnode_rt_; ++jj) {
2058  tbuf[j++] = *mt.offdiag_[jj];
2059  }
2060 #if 0
2061 //if (nrnmpi_myid == 4) {
2062 printf("%d send to %d nnode=%d nnode_rt=%d size=%d tag=%d\n",
2063 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_);
2064 //}
2065 #endif
2066 #if 0
2067 //if (nrnmpi_myid == 4) {
2068  for (j = 0; j < mt.nnode_; ++j) {
2069 printf("%d send to %d tbuf[%d] = %g tbuf[%d] = %g from node %d\n",
2070 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2071  }
2072  for (j = 0; j < mt.nnode_rt_; ++j) {
2073  int jj = 2*mt.nnode_ + j;
2074 printf("%d send to %d offdiag tbuf[%d] = %g\n",
2075 nrnmpi_myid, mt.host_, jj, tbuf[jj]);
2076  }
2077 //)
2078 #endif
2079  }
2080 
2081  // adjust area for any D, RHS, sid1A, sid1B going to a ReducedTree host
2082  for (i=0; i < narea2buf_; ++i) {
2083  Area2Buf& ab = area2buf_[i];
2084  _nt = nrn_threads + ab.ms->ithread;
2085  double afac = 0.01 * VEC_AREA(ab.inode);
2086  tbuf = tsendbuf_;
2087  for (j = 0; j < ab.n; ++j) {
2088  tbuf[ab.ibuf[j]] *= afac;
2089 #if 0
2090 printf("%d area2buf * afac=%g i=%d j=%d node=%d ibuf=%d buf=%g\n", nrnmpi_myid, afac,
2091 i, j, ab.inode, ab.ibuf[j], tbuf[ab.ibuf[j]]);
2092 #endif
2093  }
2094  }
2095 
2096  // send long backbone info (long->short, long->long)
2097  for (i=0; i < ihost_reduced_long_; ++i) {
2098  MultiSplitTransferInfo& mt = msti_[i];
2099  tbuf = tsendbuf_ + mt.displ_;
2100  nrnmpi_send_doubles(tbuf, mt.size_, mt.host_, mt.tag_);
2101 #if 0
2102 printf("%d post send %d displ=%d size=%d host=%d tag=%d\n",
2103 nrnmpi_myid, i, mt.displ_, mt.size_, mt.host_, mt.tag_);
2104 #endif
2105  }
2106 
2107 #if 0
2108  for (i=ihost_reduced_long_; i < nthost_; ++i) {
2109  MultiSplitTransferInfo& mt = msti_[i];
2110 printf("%d will send result to %d size=%d tag=%d\n", nrnmpi_myid, mt.host_, mt.size_, mt.tag_);
2111  }
2112 #endif
2113  // handle the reduced trees and short backbones
2114 
2115  // wait for receives from the long backbones needed by reduced tree
2116  // and short backbones to complete (reduced <- long, short <- long)
2117  for (i=ihost_reduced_long_; i < nthost_; ++i) {
2118  MultiSplitTransferInfo& mt = msti_[i];
2119 #if 0
2120 printf("%d wait one %d\n", nrnmpi_myid, mt.host_);
2121 #endif
2122  nrnmpi_wait(&mt.request_);
2123 #if 0
2124 printf("%d receive from %d nnode=%d nnode_rt=%d size=%d tag=%d displ=%d\n",
2125 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_, mt.displ_);
2126 //for (j=0; j < mt.size_; ++j) { printf("%d receive tbuf[%d]=%g\n", nrnmpi_myid, j, trecvbuf_[mt.displ_ + j]);}
2127 #endif
2128  }
2129 #if 0
2130 for (i=0; i < tbsize_; ++i) { printf("%d trecvbuf[%d] = %g\n", nrnmpi_myid, i, trecvbuf_[i]);}
2131 #endif
2132 
2133  // measure reducedtree,short backbone computation time
2134  rttime = nrnmpi_wtime();
2135 
2136  // adjust area in place for any D, RHS, sid1A, sid1B on this host
2137  // going to ReducedTree on this host
2138 //if (narea2rt_) printf("%d adjust area in place\n", nrnmpi_myid);
2139  for (i=0; i < narea2rt_; ++i) {
2140  Area2RT& ar = area2rt_[i];
2141  NrnThread* _nt = nrn_threads + ar.ms->ithread;
2142  double afac = 0.01 * VEC_AREA(ar.inode);
2143  for (j = 0; j < ar.n; ++j) {
2144  *ar.pd[j] *= afac;
2145  }
2146  }
2147 #endif //EXCHANGE_ON
2148 
2149  for (i = 0; i < nrtree_; ++i) {
2150  rtree_[i]->solve();
2151  }
2152 
2153 #if EXCHANGE_ON
2154  // measure reducedtree,short backbone computation time
2155  nrnmpi_rtcomp_time_ += nrnmpi_wtime() - rttime;
2156 
2157  // send reduced and short backbone info (reduced -> long, short -> long)
2158  for (i=ihost_reduced_long_; i < nthost_; ++i) {
2159  int tag;
2160  MultiSplitTransferInfo& mt = msti_[i];
2161  tbuf = tsendbuf_ + mt.displ_;
2162 //printf("%d send result to %d size=%d tag=%d\n", nrnmpi_myid, mt.host_, mt.size_, mt.tag_);
2163  tag = mt.tag_;
2164  // sending result from reduced tree means tag is 4
2165  if (tag == 3) { tag = 4; }
2166  nrnmpi_send_doubles(tbuf, mt.size_, mt.host_, tag);
2167  // only moderately wasteful (50%) in sending back irrelevant
2168  // off diag info. Required is only 2*mt.nnode_. But these
2169  // messages are pretty short, anyway.
2170  }
2171 
2172  // handle the long backbones
2173 
2174  // wait for all remaining receives to complete
2175  // long <- short, long <- long, long <- reduced
2176  for (i=0; i < ihost_reduced_long_; ++i) {
2177  MultiSplitTransferInfo& mt = msti_[i];
2178 #if 0
2179 printf("%d wait two %d\n", nrnmpi_myid, mt.host_);
2180 #endif
2181  nrnmpi_wait(&mt.request_);
2182 #if 0
2183 printf("%d receive from %d nnode=%d nnode_rt=%d size=%d tag=%d\n",
2184 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_);
2185 #endif
2186  }
2187 
2188  // add to matrix (long <- long)
2189  // and also (long <- short) even though in principle should
2190  // not add those since already solved on short backbone machine
2191  // we can add since the ones sent were scaled by 1e30 which
2192  // will eliminate the effect of existing D, RHS, and S1A or S1B
2193  // as well as making the possible area scaling irrelevant
2194  // being able to handle them together means we do not need
2195  // 0 to ihost_long_long - 1 to refer to long <-> short
2196  for (i = 0; i < ihost_reduced_long_; ++i) {
2197  MultiSplitTransferInfo& mt = msti_[i];
2198  j = 0;
2199  tbuf = trecvbuf_ + mt.displ_;
2200  for (jj = 0; jj < mt.nnode_; ++jj) {
2201  k = mt.nodeindex_[jj];
2202  _nt = nrn_threads + mt.nodeindex_th_[jj];
2203  D(k) += tbuf[j++];
2204  RHS(k) += tbuf[j++];
2205  }
2206 #if 0
2207 if (nrnmpi_myid == 4) {
2208  for (j = 0; j < mt.nnode_; ++j) {
2209 printf("%d received from %d tbuf[%d] = %g tbuf[%d] = %g added to node %d\n",
2210 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2211  }
2212 }
2213 #endif
2214  }
2215 #endif //EXCHANGE_ON
2216 
2217 #if PARANEURON
2219 #endif
2220  errno = 0;
2221 }
2222 
2223 void MultiSplitControl::matrix_exchange_nocap() { //copy of matrix_exchange() and modified
2224  int i, j, jj, k;
2225  double* tbuf;
2226  double rttime;
2227  double wt = nrnmpi_wtime();
2228  NrnThread* _nt;
2229  // the mpi strategy is copied from the
2230  // cvode/examples_par/pvkxb.cpp exchange strategy
2231 
2232 #define EXCHANGE_ON 1
2233 #if EXCHANGE_ON
2234  // post all the receives
2235  for (i=0; i < nthost_; ++i) {
2236  int tag;
2237  MultiSplitTransferInfo& mt = msti_[i];
2238  tag = mt.tag_;
2239  // receiving the result from a reduced tree is tag 4.
2240  if (tag == 3 && nrnmpi_myid != mt.rthost_) { tag = 4; }
2241  nrnmpi_postrecv_doubles(trecvbuf_ + mt.displ_, mt.size_,
2242  mt.host_, tag, &mt.request_);
2243 #if 0
2244 printf("%d post receive %d displ=%d size=%d host=%d tag=%d\n",
2245 nrnmpi_myid, i, mt.displ_, mt.size_, mt.host_, tag);
2246 #endif
2247  }
2248 
2249  // fill the send buffer with the long backbone info
2250  // i.e. long->short, long -> long, long -> reduced
2251  for (i=0; i < ihost_reduced_long_; ++i) {
2252  MultiSplitTransferInfo& mt = msti_[i];
2253  j = 0;
2254  tbuf = tsendbuf_ + mt.displ_;
2255  for (jj = 0; jj < mt.nnode_; ++jj) {
2256  k = mt.nodeindex_[jj];
2257  _nt = nrn_threads + mt.nodeindex_th_[jj];
2258  tbuf[j++] = D(k);
2259  tbuf[j++] = RHS(k);
2260  }
2261  // each sent backbone will have added 2 to mt.nnode_rt_
2262  for (jj = 0; jj < mt.nnode_rt_; ++jj) {
2263  tbuf[j++] = *mt.offdiag_[jj];
2264  }
2265 #if 0
2266 //if (nrnmpi_myid == 4) {
2267 printf("%d send to %d nnode=%d nnode_rt=%d size=%d tag=%d\n",
2268 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_);
2269 //}
2270 #endif
2271 #if 0
2272 //if (nrnmpi_myid == 4) {
2273  for (j = 0; j < mt.nnode_; ++j) {
2274 printf("%d send to %d tbuf[%d] = %g tbuf[%d] = %g from node %d\n",
2275 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2276  }
2277  for (j = 0; j < mt.nnode_rt_; ++j) {
2278  int jj = 2*mt.nnode_ + j;
2279 printf("%d send to %d offdiag tbuf[%d] = %g\n",
2280 nrnmpi_myid, mt.host_, jj, tbuf[jj]);
2281  }
2282 //}
2283 #endif
2284  }
2285 
2286  // send long backbone info (long->short, long->long)
2287  for (i=0; i < ihost_reduced_long_; ++i) {
2288  MultiSplitTransferInfo& mt = msti_[i];
2289  tbuf = tsendbuf_ + mt.displ_;
2290  nrnmpi_send_doubles(tbuf, mt.size_, mt.host_, mt.tag_);
2291 #if 0
2292 printf("%d post send %d displ=%d size=%d host=%d tag=%d\n",
2293 nrnmpi_myid, i, mt.displ_, mt.size_, mt.host_, mt.tag_);
2294 #endif
2295  }
2296 
2297 #if 0
2298  for (i=ihost_reduced_long_; i < nthost_; ++i) {
2299  MultiSplitTransferInfo& mt = msti_[i];
2300 printf("%d will send result to %d size=%d tag=%d\n", nrnmpi_myid, mt.host_, mt.size_, mt.tag_);
2301  }
2302 #endif
2303  // handle the reduced trees and short backbones
2304 
2305  // wait for receives from the long backbones needed by reduced tree
2306  // and short backbones to complete (reduced <- long, short <- long)
2307  for (i=ihost_reduced_long_; i < nthost_; ++i) {
2308  MultiSplitTransferInfo& mt = msti_[i];
2309 #if 0
2310 printf("%d wait one %d\n", nrnmpi_myid, mt.host_);
2311 #endif
2312  nrnmpi_wait(&mt.request_);
2313 #if 0
2314 printf("%d receive from %d nnode=%d nnode_rt=%d size=%d tag=%d displ=%d\n",
2315 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_, mt.displ_);
2316 //for (j=0; j < mt.size_; ++j) { printf("%d receive tbuf[%d]=%g\n", nrnmpi_myid, j, trecvbuf_[mt.displ_ + j]);}
2317 #endif
2318  }
2319 #if 0
2320 for (i=0; i < tbsize_; ++i) { printf("%d trecvbuf[%d] = %g\n", nrnmpi_myid, i, trecvbuf_[i]);}
2321 #endif
2322 
2323  // measure reducedtree,short backbone computation time
2324  rttime = nrnmpi_wtime();
2325 
2326 #endif //EXCHANGE_ON
2327 
2328  // remember V may be encoded in D
2329  for (i = 0; i < nrtree_; ++i) {
2330  rtree_[i]->nocap();
2331  }
2332 
2333 #if EXCHANGE_ON
2334  // replace in matrix
2335  // for zero area nodes, D is 1.0 and RHS is V from zero area node.
2336  // for non-zero area nodes, they are sum from zero area.
2337  for (i=ihost_short_long_; i < nthost_; ++i) {
2338  MultiSplitTransferInfo& mt = msti_[i];
2339  j = 0;
2340  tbuf = trecvbuf_ + mt.displ_;
2341  for (jj = 0; jj < mt.nnode_; ++jj) {
2342  k = mt.nodeindex_[jj];
2343  _nt = nrn_threads + mt.nodeindex_th_[jj];
2344  D(k) = tbuf[j++];
2345  RHS(k) = tbuf[j++];
2346  }
2347 #if 0
2348  for (j = 0; j < mt.nnode_; ++j) {
2349 printf("%d received from %d tbuf[%d] = %g tbuf[%d] = %g added to node %d\n",
2350 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2351  }
2352 #endif
2353  }
2354  // measure reducedtree,short backbone computation time
2355  nrnmpi_rtcomp_time_ += nrnmpi_wtime() - rttime;
2356 
2357  // send reduced and short backbone info (reduced -> long, short -> long)
2358  for (i=ihost_reduced_long_; i < nthost_; ++i) {
2359  int tag;
2360  MultiSplitTransferInfo& mt = msti_[i];
2361  tbuf = tsendbuf_ + mt.displ_;
2362 //printf("%d send result to %d size=%d tag=%d\n", nrnmpi_myid, mt.host_, mt.size_, mt.tag_);
2363  tag = mt.tag_;
2364  // sending result from reduced tree means tag is 4
2365  if (tag == 3) { tag = 4; }
2366  nrnmpi_send_doubles(tbuf, mt.size_, mt.host_, tag);
2367  // only moderately wasteful (50%) in sending back irrelevant
2368  // off diag info. Required is only 2*mt.nnode_. But these
2369  // messages are pretty short, anyway.
2370  }
2371 
2372  // handle the long backbones
2373 
2374  // wait for all remaining receives to complete
2375  // long <- short, long <- long, long <- reduced
2376  for (i=0; i < ihost_reduced_long_; ++i) {
2377  MultiSplitTransferInfo& mt = msti_[i];
2378 #if 0
2379 printf("%d wait two %d\n", nrnmpi_myid, mt.host_);
2380 #endif
2381  nrnmpi_wait(&mt.request_);
2382 #if 0
2383 printf("%d receive from %d nnode=%d nnode_rt=%d size=%d tag=%d\n",
2384 nrnmpi_myid, mt.host_, mt.nnode_, mt.nnode_rt_, mt.size_, mt.tag_);
2385 #endif
2386  }
2387 
2388  // replace in matrix
2389  // for zero area nodes, D is 1.0 and RHS is V from zero area node.
2390  // for non-zero area nodes, they are sum from zero area.
2391  for (i = 0; i < ihost_reduced_long_; ++i) {
2392  MultiSplitTransferInfo& mt = msti_[i];
2393  j = 0;
2394  tbuf = trecvbuf_ + mt.displ_;
2395  for (jj = 0; jj < mt.nnode_; ++jj) {
2396  k = mt.nodeindex_[jj];
2397  _nt = nrn_threads + mt.nodeindex_th_[jj];
2398  D(k) = tbuf[j++];
2399  RHS(k) = tbuf[j++];
2400  }
2401 #if 0
2402 if (nrnmpi_myid == 4) {
2403  for (j = 0; j < mt.nnode_; ++j) {
2404 printf("%d received from %d tbuf[%d] = %g tbuf[%d] = %g added to node %d\n",
2405 nrnmpi_myid, mt.host_, 2*j, tbuf[2*j], 2*j+1, tbuf[2*j+1], mt.nodeindex_[j]);
2406  }
2407 }
2408 #endif
2409  }
2410 #endif //EXCHANGE_ON
2411 
2412 #if PARANEURON
2414 #endif
2415  errno = 0;
2416 }
2417 
2419 //printf("ReducedTree::ReducedTree(%d, %d)\n", rank, mapsize);
2420  int i;
2421  msc = ms;
2422  n = rank;
2423  assert(n > 0);
2424  assert(mapsize > 0);
2425  ip = new int[n];
2426  rhs = new double[4*n];
2427  d = rhs + n;
2428  a = d + n;
2429  b = a + n;
2430 
2431  n2 = 2*n;
2432  n4 = 4*n;
2433  nmap = mapsize;
2434  smap = new double*[nmap];
2435  rmap = new double*[nmap];
2436  ismap = new int[nmap];
2437  irmap = new int[nmap];
2438  nzindex = new int[n];
2439  rmap2smap_index = new int[nmap];
2440  v = new double[n];
2441  nsmap = 0;
2442  irfill = 0;
2443  for (i = 0; i < nmap; ++i) {
2444  smap[i] = 0;
2445  ismap[i] = -1;
2446  rmap[i] = 0;
2447  irmap[i] = -1;
2448  rmap2smap_index[i] = -1;
2449  }
2450 }
2451 
2453  int i;
2454  delete [] ip;
2455  delete [] rhs;
2456  delete [] smap;
2457  delete [] ismap;
2458  delete [] rmap;
2459  delete [] irmap;
2460  delete [] nzindex;
2461  delete [] v;
2462  delete [] rmap2smap_index;
2463 }
2464 
2466  // remember that info from zero area nodes is D=d and RHS=rhs
2467  // and from non-zero area is D=1e50 and RHS=v*1e50
2468  // on return, the info for zero area if no nonzero should be
2469  // D=sum d and RHS=sum rhs
2470  // the info for zero area if nonzero should be D=1.0 and RHS=v
2471  // and the info for nonzero should be D=sum d and RHS=sum rhs.
2472  int i;
2473  for (i=0; i < n; ++i) {
2474  rhs[i] = 0.0;
2475  d[i] = 0.0;
2476  nzindex[i] = -1;
2477  }
2478  // not supposed to add when D=1e50. Note that order in rmap is
2479  // rhs followed by d
2480  for (i=0; i < nmap; i+=2) {
2481  int j = irmap[i]; // reduced matrix row index for rhs
2482  if (*rmap[i+1] == 1e50) {
2483  v[j] = *rmap[i] * 1e-50;
2484  nzindex[j] = rmap2smap_index[i];
2485  }else{
2486  rhs[j] += *rmap[i];
2487  d[j] += *rmap[i+1];
2488  }
2489  }
2490 #if 0
2491  for (i=0; i < n; ++i) {
2492  printf("%d ReducedTree %2d %12.5g %12.5g %d %12.5g\n",
2493  nrnmpi_myid, i, d[i], rhs[i], nzindex[i], v[i]);
2494  }
2495 #endif
2496  for (i=0; i < nsmap; i += 2) {
2497  int j = ismap[i]; // reduced matrix matrix row index for rhs
2498  if (nzindex[j] == -1 || i == nzindex[j]) {
2499  // for the non-zero area nodes and zero area nodes
2500  // when no non-zero area node involved
2501  *smap[i] = rhs[j];
2502  *smap[i+1] = d[j];
2503  }else{
2504  // for the zero area nodes
2505  *smap[i] = v[j];
2506  *smap[i+1] = 1.0;
2507  }
2508  }
2509 }
2510 
2512  int i;
2513  double p;
2514  gather();
2515 #if 0
2516  for (i=0; i < n; ++i) {
2517  printf("%d ReducedTree %2d %12.5g %12.5g %2d %12.5g %12.5g\n",
2518  nrnmpi_myid, i, b[i], d[i], ip[i], ip[i] >= 0?a[i]:0., rhs[i]);
2519  }
2520 #endif
2521  // triangularization
2522  for (i = n - 1; i > 0; --i) {
2523  p = a[i]/d[i];
2524  d[ip[i]] -= p*b[i];
2525  rhs[ip[i]] -= p*rhs[i];
2526  }
2527  // back substitution
2528  rhs[0] /= d[0];
2529  for (i = 1; i < n; ++i) {
2530  rhs[i] -= b[i]*rhs[ip[i]];
2531  rhs[i] /= d[i];
2532  }
2533 #if 0
2534  for (i=0; i < n; ++i) {
2535  printf("%d Result %2d %12.5g\n", nrnmpi_myid, i, rhs[i]);
2536  }
2537 #endif
2538  scatter();
2539 }
2540 
2542  int i;
2543  for (i=0; i < n4; ++i) { rhs[i] = 0.0; }
2544  for (i=0; i < nmap; ++i) { rhs[irmap[i]] += *rmap[i]; }
2545 #if 0
2546 for (i=0; i < nmap; ++i){
2547 printf("%d %p gather i=%d ie=%d %g\n", nrnmpi_myid, this, i, irmap[i], *rmap[i]);
2548 }
2549 #endif
2550 }
2551 
2553  int i;
2554  // scatter the result for this host
2555  // fill the send buffer for other hosts
2556 #if 0
2557 for (i=0; i < nsmap; i += 2) {
2558 printf("%d enter scatter %d %p %g %p %g\n", nrnmpi_myid, i,
2559 smap[i], *smap[i], smap[i+1], *smap[i+1]);
2560 }
2561 #endif
2562 //printf("nsmap=%d\n", nsmap);
2563  for (i=0; i < nsmap; i += 2) {
2564 //printf("i=%d ismap[i]=%d\n", i, ismap[i]);
2565  *smap[i] = 1e30*rhs[ismap[i]];
2566  *smap[i+1] = 1e30;
2567  }
2568 #if 0
2569 for (i=0; i < nsmap; i += 2) if (i > 10){
2570 printf("%d leave scatter %d %p %g %p %g\n", nrnmpi_myid, i,
2571 smap[i], *smap[i], smap[i+1], *smap[i+1]);
2572 }
2573 #endif
2574 }
2575 
2576 void ReducedTree::pr_map(int tsize, double* trbuf) {
2577  int i;
2578  Printf(" rmap\n");
2579  for (i=0; i < nmap; ++i) {
2580  for (int it=0; it < nrn_nthread; ++it) {
2581  NrnThread* nt = nrn_threads + it;
2582  MultiSplitThread& t = msc_->mth_[it];
2583  int nb = t.backbone_end - t.backbone_begin;
2584  if (rmap[i] >= trbuf && rmap[i] < (trbuf + tsize)) {
2585  Printf(" %2d rhs[%2d] += tbuf[%ld]\n", i, irmap[i], rmap[i]-trbuf);
2586  }
2587  if (rmap[i] >= nt->_actual_rhs && rmap[i] < (nt->_actual_rhs + nt->end)) {
2588  Node* nd = nt->_v_node[rmap[i] - nt->_actual_rhs];
2589  Printf(" %2d rhs[%2d] rhs[%d] += rhs[%ld] \t%s{%d}\n", i, irmap[i], irmap[i], rmap[i] - nt->_actual_rhs, secname(nd->sec), nd->sec_node_index_);
2590  }
2591  if (rmap[i] >= nt->_actual_d && rmap[i] < (nt->_actual_d + nt->end)) {
2592  Printf(" %2d rhs[%2d] d[%d] += d[%ld]\n", i, irmap[i], irmap[i] - n, rmap[i] - nt->_actual_d);
2593  }
2594  if (rmap[i] >= t.sid1A && rmap[i] < (t.sid1A + nb)) {
2595  Printf(" %2d rhs[%2d] a[%d] += sid1A[%ld]", i, irmap[i], irmap[i] - 2*n, rmap[i]-t.sid1A);
2596  int j = (rmap[i] - t.sid1A) + t.backbone_begin;
2597  Node* nd = nt->_v_node[j];
2598 Printf(" \tA(%d) %s{%d}", j, secname(nd->sec), nd->sec_node_index_);
2599  Printf("\n");
2600  }
2601  if (rmap[i] >= t.sid1B && rmap[i] < (t.sid1B + nb)) {
2602  Printf(" %2d rhs[%2d] b[%d] += sid1B[%ld]", i, irmap[i], irmap[i] - 3*n, rmap[i]-t.sid1B);
2603  int j = (rmap[i] - t.sid1B) + t.backbone_begin;
2604  Node* nd = nt->_v_node[j];
2605 Printf("\tB(%d) %s{%d}", j, secname(nd->sec), nd->sec_node_index_);
2606  Printf("\n");
2607  }
2608  }
2609  }
2610 }
2611 
2612 void ReducedTree::reorder(int j, int nt, int* mark, int* allbbr, int* allsid) {
2613  // specify ip and modify s2rt so they have tree order consistency
2614  // there should be n-1 edges, so
2615  if (n == 1) {
2616  ip[0] = -1;
2617  return;
2618  }
2619  int* e1 = new int[n-1];
2620  int* e2 = new int[n-1];
2621  int* order = new int[n];
2622  int* sid = new int[n];
2623  int singlesid = -1;
2624  int i;
2625  for (i=0; i < n; ++i) { order[i] = -1; }
2626  // fill in the edges.
2627  int ne = n - 1;
2628  int ie = 0;
2629  for (i = 0; i < nt; ++i) {
2630  if (mark[i] == j && allbbr[i] == 2) { singlesid = allsid[i]; }
2631  if (mark[i] == j && allbbr[i] > 2 && allsid[i] < allbbr[i]-3) {
2632 //printf("i=%d ie=%d ne=%d mark=%d allsid=%d allbbr=%d\n", i, ie, ne, mark[i], allsid[i], allbbr[i]-3);
2633  assert(ie < ne);
2634  const auto& e1ieiter = s2rt->find(allsid[i]);
2635  nrn_assert(e1ieiter != s2rt->end());
2636  e1[ie] = e1ieiter->second;
2637  sid[e1[ie]]= allsid[i];
2638  const auto& e2ieiter = s2rt->find(allbbr[i]-3);
2639  nrn_assert(e2ieiter != s2rt->end());
2640  e2[ie] = e2ieiter->second;
2641  sid[e2[ie]] = allbbr[i]-3;
2642  ++ie;
2643  }
2644  }
2645  assert(ie == ne);
2646  if (ne == 0) { assert(singlesid >= 0); sid[0] = singlesid; }
2647  // order the elements starting at 0 // also fill in the ip vector
2648  // should replace by a O(n) or at worst a O(nlogn) algorithm
2649  ip[0] = -1;
2650  order[0] = 0;
2651  int ordered = 1;
2652  while (ordered < n) {
2653  int old = ordered;
2654  for (i=0; i < ne; ++i) {
2655  if (e1[i] >= 0) {
2656  if (order[e1[i]] >= 0) {
2657  assert(order[e2[i]] == -1);
2658  ip[ordered] = order[e1[i]];
2659  order[e2[i]] = ordered++;
2660  e1[i] = -1; // edge handled
2661  e2[i] = -1;
2662  }else if (order[e2[i]] >= 0) {
2663  assert(order[e1[i]] == -1);
2664  ip[ordered] = order[e2[i]];
2665  order[e1[i]] = ordered++;
2666  e1[i] = -1; // edge handled
2667  e2[i] = -1;
2668  }
2669  }
2670  }
2671  assert(ordered > old); // if tree then accomplished something
2672  }
2673 
2674  // reset the s2rt ranks for the sids
2675  for (i=0; i < n; ++i) {
2676  (*s2rt)[sid[i]] = order[i];
2677  }
2678 
2679 #if 0
2680  printf("%d ReducedTree n=%d nmap=%d\n", nrnmpi_myid, n, nmap);
2681  for (i=0; i < n; ++i) {
2682  printf("%d i=%d sid=%d sid2i=%d\n", nrnmpi_myid, i, sid[i], (*s2rt)[sid[i]]);
2683  }
2684  for (i=0; i < n; ++i) {
2685  printf("%d i=%d ip=%d\n", nrnmpi_myid, i, ip[i]);
2686  }
2687 #endif
2688 
2689  delete [] e1;
2690  delete [] e2;
2691  delete [] order;
2692  delete [] sid;
2693 }
2694 
2695 void ReducedTree::fillrmap(int sid1, int sid2, double* pd) {
2696  const auto& sid1_iter = s2rt->find(sid1);
2697  nrn_assert(sid1_iter != s2rt->end());
2698  const int i = sid1_iter->second;
2699  int j;
2700 
2701  // type order is RHS, D, A, B
2702  if (sid2 < 0) { // RHS
2703  j = i;
2704  }else if (sid2 == sid1) { // D
2705  j = i + n;
2706  }else{ // A or B?
2707  const auto& sid2_iter = s2rt->find(sid2);
2708  nrn_assert(sid2_iter != s2rt->end());
2709  j = sid2_iter->second;
2710  if (ip[i] == j) { // A
2711  j = i + 2*n;
2712  }else if (ip[j] == i) { // B
2713  j = j + 3*n;
2714  }else{
2715  assert(0);
2716  }
2717  }
2718  // in most cases we could have done RHS and D together since they
2719  // are adjacent in the receive buffer. However they are not adjacent
2720  // for the sids on this machine.
2721 //printf("%d fillrmap sid1=%d sid2=%d i=%d j=%d irfill=%d\n", nrnmpi_myid, sid1, sid2, i, j, irfill);
2722  irmap[irfill] = j;
2723  rmap[irfill] = pd;
2724  rmap2smap_index[irfill] = nsmap;
2725  irfill += 1;
2726 }
2727 
2728 void ReducedTree::fillsmap(int sid, double* prhs, double* pd) {
2729  const auto& sid_iter = s2rt->find(sid);
2730  nrn_assert(sid_iter != s2rt->end());
2731  const int i = sid_iter->second;
2732 
2733 //printf("%d fillsmap sid=%d i=%d nsmap=%d\n", nrnmpi_myid, sid, i, nsmap);
2734  ismap[nsmap] = i;
2735  smap[nsmap] = prhs;
2736  ismap[nsmap+1] = i;
2737  smap[nsmap+1] = pd;
2738  nsmap += 2;
2739 }
2740 
2741 // triangularize all subtrees, results in a backbone tri-diagonal
2743  int i, ip;
2744  double p;
2745  // eliminate a of the subtrees
2746  for (i=i3 - 1; i >= backbone_end; --i) {
2747  ip = _nt->_v_parent_index[i];
2748  p = A(i) / D(i);
2749  D(ip) -= p * B(i);
2750  RHS(ip) -= p * RHS(i);
2751  }
2752 #if 0
2753 printf("end of triang_subtree2backbone\n");
2754 for (i=i1; i < backbone_end; ++i) {
2755  printf("i=%d D=%g RHS=%g\n", i, D(i), RHS(i));
2756 }
2757 #endif
2758 }
2759 
2760 //backbone starts as tridiagonal, ends up in form of N.
2762  int i, j, ip, jp;
2763  double p;
2764  // begin the backbone triangularization. This eliminates a and fills in
2765  // sid1A column. Begin with pivot equation adjacent to sid1.
2766  for (i=backbone_sid1_begin; i < backbone_end; ++i) {
2767  // what is the equation index for A(i)
2768  j = _nt->_v_parent_index[i] - backbone_begin;
2769  S1A(j) = A(i);
2770  }
2771  for (i = backbone_sid1_begin-1; i >= backbone_interior_begin; --i) {
2772  ip = _nt->_v_parent_index[i];
2773  j = i - backbone_begin;
2774  jp = ip - backbone_begin;
2775  p = A(i) / D(i);
2776  D(ip) -= p * B(i);
2777  RHS(ip) -= p * RHS(i);
2778  S1A(jp) = -p * S1A(j);
2779 //printf("iter i=%d ip=%d j=%d jp=%d D(ip)=%g RHS(ip)=%g S1A(ip)=%g\n",
2780 //i, ip, j, jp, D(ip), RHS(ip), S1A(jp));
2781  }
2782 
2783  // complete the backbone triangularization to form the N shaped matrix
2784  // This eliminates b and fills in the sid1B column. Begin with
2785  // pivot equation adjacent to sid0 (the root)
2786  for (i = backbone_interior_begin; i < backbone_sid1_begin; ++i) {
2787  ip = _nt->_v_parent_index[i];
2788  j = i - backbone_begin;
2789  if (ip < backbone_interior_begin) {
2790  S1B(j) = B(i);
2791  continue;
2792  }
2793  jp = ip - backbone_begin;
2794  p = B(i)/D(ip);
2795  RHS(i) -= p * RHS(ip);
2796  S1A(j) -= p * S1A(jp);
2797  S1B(j) = -p * S1B(jp);
2798 //printf("iter2 i=%d j=%d D(i)=%g RHS(i)=%g S1A(j)=%g S1B(j)=%g\n",
2799 //i, j, D(i), RHS(i), S1A(j), S1B(j));
2800  }
2801  // exactly like above but S1A happens to be D
2802  for (i = backbone_sid1_begin; i < backbone_end; ++i) {
2803  ip = _nt->_v_parent_index[i];
2804  j = i - backbone_begin;
2805  if (ip < backbone_interior_begin) {
2806  S1B(j) = B(i);
2807  continue;
2808  }
2809  jp = ip - backbone_begin;
2810  p = B(i)/D(ip);
2811  RHS(i) -= p * RHS(ip);
2812  D(i) -= p * S1A(jp);
2813  S1B(j) = -p * S1B(jp);
2814 //printf("iter3 i=%d j=%d D(i)=%g RHS(i)=%g S1B(j)=%g\n",
2815 //i, j, D(i), RHS(i), S1B(j));
2816  }
2817 #if 0
2818 printf("%d end of triang_backbone\n", nrnmpi_myid);
2819 for (i=i1; i < backbone_end; ++i) {
2820  printf("%d i=%d D=%g RHS=%g\n", nrnmpi_myid, i, D(i), RHS(i));
2821 }
2822 #endif
2823 }
2824 
2825 // exchange of d and rhs of sids has taken place and we can solve for the
2826 // backbone nodes
2828  int i, j, ip, ip1, ip2;
2829  double a, b, p, vsid1;
2830  // need to solve the 2x2 consisting of sid0 and sid1 points
2831  // this part has already been done for short backbones
2832  j = backbone_long_sid1_begin;
2833 //printf("%d, backbone %d %d %d %d %d %d\n", nrnmpi_myid,
2834 //backbone_begin, backbone_long_begin, backbone_interior_begin,
2835 //backbone_sid1_begin, backbone_long_sid1_begin, backbone_end);
2836  for (i=backbone_long_begin; i < backbone_interior_begin; ++i) {
2837  a = S1A(i - backbone_begin);
2838  b = S1B(j - backbone_begin);
2839  p = b/D(i);
2840  D(j) -= p * a;
2841  RHS(j) -= p * RHS(i);
2842  RHS(j) /= D(j);
2843  RHS(i) -= a * RHS(j);
2844  RHS(i) /= D(i);
2845  ++j;
2846  }
2847  // now sid0 and sid1 values give us the column values
2848  // do in two sweeps. Wish we could keep a single cell backbone
2849  // contiguous.
2850  // Do the S1A sweep on a per cell basis.
2851  for (i = backbone_sid1_begin; i < backbone_end; ++i) {
2852  vsid1 = RHS(i);
2853  for (j = _nt->_v_parent_index[i]; j >= backbone_interior_begin; j = _nt->_v_parent_index[j]) {
2854  RHS(j) -= S1A(j - backbone_begin) * vsid1;
2855  }
2856  }
2857  // For the S1B sweep we use sid0 root info for each interior node
2858  for (i = backbone_interior_begin; i < backbone_sid1_begin; ++i) {
2859  j = i - backbone_begin;
2860  RHS(i) -= S1B(j) * RHS(sid0i[j]);
2861  RHS(i) /= D(i);
2862  }
2863 #if 0
2864 printf("%d end of bksub_backbone\n", nrnmpi_myid);
2865 for (i=i1; i < backbone_end; ++i) {
2866  printf("%d i=%d RHS=%g\n", nrnmpi_myid, i, RHS(i));
2867 }
2868 #endif
2869 }
2870 
2872  int i, j;
2873  double a, b, p;
2874  // solve the 2x2 consisting of sid0 and sid1 points.
2875  j = backbone_sid1_begin;
2876  for (i=backbone_begin; i < backbone_long_begin; ++i) {
2877  a = S1A(i - backbone_begin);
2878  b = S1B(j - backbone_begin);
2879 #if 0
2880 printf("%d part1 i=%d j=%d\n",
2881 nrnmpi_myid, i, j);
2882 printf("%d part1 d=%12.5g a=%12.5g rhs=%12.5g\n",
2883 nrnmpi_myid,D(i), a, RHS(i));
2884 printf("%d part1 b=%12.5g d=%12.5g rhs=%12.5g\n",
2885 nrnmpi_myid, b, D(j), RHS(j));
2886 #endif
2887  p = b/D(i);
2888  D(j) -= p * a;
2889  RHS(j) -= p * RHS(i);
2890  RHS(j) /= D(j);
2891  RHS(i) -= a * RHS(j);
2892  RHS(i) /= D(i);
2893 #if 0
2894 printf("%d part1 result %12.5g %12.5g\n",
2895 nrnmpi_myid, RHS(i), RHS(j));
2896 #endif
2897  ++j;
2898  }
2899  // Note that it no longer makes sense to add D and RHS
2900  // to the other machines since RHS is the solution itself
2901  // at the other machine corresponding points. In fact
2902  // we don't need to send D at all. Just replace the
2903  // RHS, skip that equation, and go on. But for
2904  // code uniformity and since skipping is more complex
2905  // it seems simpler to just setup the right equation
2906  // on the other machine. i.e. 1 * v = rhs
2907 }
2908 
2909 //solve the subtrees, rhs on the backbone is already solved
2911  int i, ip;
2912  // solve all rootnodes not part of a backbone
2913  for (i=i1; i < backbone_begin; ++i) {
2914  RHS(i) /= D(i);
2915  }
2916  // solve the subtrees
2917  for (i=backbone_end; i < i3; ++i) {
2918  ip = _nt->_v_parent_index[i];
2919  RHS(i) -= B(i) * RHS(ip);
2920  RHS(i) /= D(i);
2921  }
2922 #if 0
2923 printf("%d end of bksub_subtrees (just the backbone)\n", nrnmpi_myid);
2924 for (i=i1; i < backbone_end; ++i) {
2925  printf("%d i=%d RHS=%g\n", nrnmpi_myid, i, RHS(i));
2926 }
2927 #endif
2928 }
2929 
2930 // fill the v_node, v_parent node vectors in the proper order and
2931 // determine backbone index values. See the NOTE above. The relevant order
2932 // is:
2933 // 1) all the roots of cells with 0 or 1 sid (no backbone involved)
2934 // 2) all the sid0 when there are two sids in a cell. 1) + 2) = rootnodecount
2935 // 3) the interior backbone nodes (does not include sid0 or sid1 nodes)
2936 // 4) all the sid1 nodes
2937 // 5) all remaining nodes
2938 // backbone_begin is the index for 2).
2939 // backbone_long_begin ; earlier are short backbones, later are long
2940 // backbone_interior_begin is one more than the last index for 2)
2941 // backbone_sid1_begin is the index for 4)
2942 // backbone_long_sid1_begin ; ealier are short, later are long
2943 // backbone_end is the index for 5.
2944 // We allow the single or pair sid nodes to be different from a rootnode.
2945 // Note: generally speaking, the first sid will probably always be
2946 // a classical root node. However, we do want to handle the case where
2947 // many branches connect to soma(0.5), so do the whole tree ordering
2948 // as the general case.
2949 
2951  msc_->v_setup();
2952 }
2953 
2955  if (!classical_root_to_multisplit_) { return; }
2956  // the typical case is that this comes from the beginning of exchange_setup
2957  // through recalc_diam since exhange_setup needs the proper
2958  // thread nt->_v_node and ms->ithread. Hence anything that
2959  // changes the overall structure
2960  // requires a complete start over from the point prior to splitting.
2961 
2963  assert(!use_sparse13);
2964  int i;
2965  // first time through, nth_ = 0
2966  if (nth_ == 0) {
2967 //printf("MultiSplitControl::v_setup due to multisplit()\n");
2968  assert(mth_ == 0);
2969  nth_ = nrn_nthread;
2970  mth_ = new MultiSplitThread[nth_];
2971  for (i=0; i < nrn_nthread; ++i) {
2972  mth_[i].v_setup(nrn_threads + i);
2973  }
2974  }else{
2975  if (nth_ != nrn_nthread) {
2976 hoc_execerror("ParallelContext.nthread() was changed after ParallelContext.multisplit()",0);
2977  }
2978  // pointers have changed and need to reorder and update reduced tree maps
2979 //printf("MultiSplitControl::v_setup due to pointer change\n");
2980  for (i=0; i < nrn_nthread; ++i) {
2981  mth_[i].v_setup(nrn_threads + i);
2982  }
2983  }
2984 }
2985 
2987  // Precondition:
2988  // v_node and v_parent node vectors are in their classical tree order
2989  // and Node.v_node_index also is consistent with that.
2990  // calls to pc.multisplit() define the implicit connections
2991  // between splitcell
2992  // Postcondition: v_node and v_parent node vectors are in their multisplit
2993  // tree order. That info can be subsequently used to re-allocate
2994  // the actual v
2995  i1 = 0;
2996  i2 = i1 + nt->ncell;
2997  i3 = nt->end;
2998  int nnode = i3 - i1;
2999 
3000  MultiSplitTable* classical_root_to_multisplit_ = msc_->classical_root_to_multisplit_.get();
3001  MultiSplitList* multisplit_list_ = msc_->multisplit_list_;
3002 
3003  // node vs v_node relation is always with an i1 offset
3004  Node** node = new Node*[nnode];
3005  Node** parent = new Node*[nnode];
3006  Node* nd;
3007  int i, j, i0, ii, in, ip, nback, ib, ibl, ibs;
3008  int is0, is1, k0, k1, iss0, iss1, isl0, isl1;
3009 
3010 #if 0
3011 printf("multisplit_v_setup %d\n", nnode);
3012  for (i=i1; i < i3; ++i) {
3013  assert(nt->_v_node[i]->v_node_index == i);
3014  assert(nt->_v_parent[i] == 0 || nt->_v_parent[i]->v_node_index < i);
3015  }
3016 #endif
3017 #if 0
3018 printf("multisplit_v_setup %d\n", nnode);
3019 printf("\nclassical order\n");
3020 for (i=i1; i < i3; ++i) {
3021  printf("%d %d %d %s %d\n", i, nt->_v_node[i]->v_node_index,
3022  nt->_v_parent[i] ? nt->_v_parent[i]->v_node_index : -1, secname(nt->v_node[i]->sec),
3023  nt->_v_node[i]->sec_node_index_);
3024 }
3025 #endif
3026  del_sidA();
3027 
3028  // the first phase is to transform from the classical tree order
3029  // to a tree order where the sid0 are roots. In splitcell, this
3030  // was done at the hoc level (rerooting a tree involves reversing
3031  // the parent child relationship on the line between the old and
3032  // new roots). However at the hoc level it is only possible to
3033  // have split points at section boundaries and
3034  // that meant that soma(0.5) could not
3035  // be a root. We want to allow that. It is also much simpler
3036  // to deal with the backbone between sid0 and sid1
3037  // if sid0 is already a root.
3038 
3039  // how many sid0 backbone (long and short) roots are there
3040  backbone_begin = i2;
3041  backbone_long_begin = backbone_begin;
3042  if (classical_root_to_multisplit_) {
3043  MultiSplit* ms;
3044  for (int ii=0; ii < multisplit_list_->count(); ++ii) {
3045  ms = multisplit_list_->item(ii);
3046  if (ms->nd[1]) {
3047  if (ms->nd[1]->_nt != nt) { continue; }
3048  --backbone_begin;
3049  if (ms->backbone_style != 1) {
3050  --backbone_long_begin;
3051  }
3052  if (ms->backbone_style == 2) {
3053  ++nbackrt_;
3054  }
3055  }
3056  }
3057  }
3058  backbone_interior_begin = i2;
3059  if (nbackrt_) {
3060  backsid_ = new int[nbackrt_];
3061  backAindex_ = new int[nbackrt_];
3062  backBindex_ = new int[nbackrt_];
3063  }
3064 
3065  // reorder the trees so sid0 is the root, at least with respect to v_parent
3066  for (i = i1; i < i2; ++i) {
3067  Node* oldroot = nt->_v_node[i];
3068  if (classical_root_to_multisplit_) {
3069  const auto& msiter = classical_root_to_multisplit_->find(oldroot);
3070  if (msiter != classical_root_to_multisplit_->end()) {
3071  nd = msiter->second->nd[0];
3072  if (nd == oldroot) { // the cell tree is fine
3073  // the way it is (the usual case)
3074  nt->_v_parent[nd->v_node_index] = 0;
3075  } else {
3076  // need to reverse the parent/child relationship
3077  // on the path from nd to oldroot
3078  in = nd->v_node_index;
3079  nd = 0;
3080  while (in > i) {
3081  ip = nt->_v_parent[in]->v_node_index;
3082  nt->_v_parent[in] = nd;
3083  nd = nt->_v_node[in];
3084  in = ip;
3085  }
3086  nt->_v_parent[in] = nd;
3087  }
3088  }
3089  }
3090  }
3091 
3092  // Now the only problem is that assert(v_parent[i] < v_node[i])
3093  // is false. But we can take care of that at the end.
3094  // the sid0 are now root nodes with respect to v_parent
3095 
3096 #if 0
3097 printf("\nsid0 is a root\n");
3098 for (i=i1; i < i3; ++i) {
3099  printf("%d %d %d %s %d\n", i, nt->_v_node[i]->v_node_index,
3100  nt->_v_parent[i] ? nt->_v_parent[i]->v_node_index : -1,
3101  secname(nt->_v_node[i]->sec), nt->_v_node[i]->sec_node_index_);
3102 }
3103 #endif
3104  // Index all the classical rootnodes and sid rootnodes
3105  // If the classical rootnode has only 1 sid, it
3106  // replaces it as a rootnode. If there are 2 the first sid node
3107  // gets shifted to the backbone_begin indices in the proper
3108  // short or long section.
3109  backbone_end = i2;
3110  ii = i1;
3111  ibl = backbone_long_begin;
3112  ibs = backbone_begin;
3113  for (i = i1; i < i2; ++i) {
3114  nd = nt->_v_node[i];
3115  if(classical_root_to_multisplit_ && classical_root_to_multisplit_->find(nd) != classical_root_to_multisplit_->end()) {
3116  MultiSplit* ms = classical_root_to_multisplit_->operator[](nd);
3117  if (ms->nd[1]) {
3118  ib = ms->backbone_style >= 1 ? ibs : ibl;
3119  node[ib-i1] = ms->nd[0];
3120  parent[ib-i1] = 0;
3121  if (ms->backbone_style >= 1) {++ibs;} else {++ibl;}
3122  // here we can get a bit greedy and
3123  // start counting how many indices are
3124  // needed for the backbones
3125  // how many nodes between nd[0] and nd[1]
3126  // note that nd[0] IS a root.
3127  i0 = ms->nd[0]->v_node_index;
3128  int iii = ms->nd[1]->v_node_index;
3129  while(i0 != iii) {
3130  iii = nt->_v_parent[iii]->v_node_index;
3131  ++backbone_end;
3132  }
3133  }else{
3134  node[ii-i1] = ms->nd[0];
3135  parent[ii-i1] = 0;
3136  ++ii;
3137  }
3138  }else{
3139  node[ii-i1] = nd;
3140  parent[ii-i1] = 0;
3141  ++ii;
3142  }
3143  }
3144  backbone_sid1_begin = backbone_end - (backbone_interior_begin - backbone_begin);
3145  backbone_long_sid1_begin = backbone_sid1_begin + (backbone_long_begin - backbone_begin);
3146 //printf("backbone begin=%d interior=%d sid1=%d end=%d\n",
3147 //backbone_begin, backbone_interior_begin, backbone_sid1_begin, backbone_end);
3148 #if 0
3149  for (i=i1; i < backbone_interior_begin; ++i) {
3150  assert(parent[i-i1] == 0);
3151  }
3152 #endif
3153  nback = backbone_end - backbone_begin;
3154  if (nback) {
3155  sid0i = new int[nback];
3156  sid1A = new double[nback];
3157  sid1B = new double[nback];
3158  for (i=0; i < nback; ++i) {
3159  sid1A[i] = sid1B[i] = 0.;
3160  sid0i[i] = -1;
3161  }
3162  }
3163  iss0 = backbone_begin;
3164  iss1 = backbone_sid1_begin;
3165  isl0 = backbone_long_begin;
3166  isl1 = backbone_long_sid1_begin;
3167  ib = backbone_sid1_begin;
3168  // order backbone_sid1_begin to backbone_begin (same order as
3169  // backbone_begin to backbone_interior)
3170  // again, short are before long
3171  // the backbone interior can be any order
3172  int ibrt = 0;
3173  for (i = i1; i < i2; ++i) {
3174  nd = nt->_v_node[i];
3175  if(classical_root_to_multisplit_) {
3176  MultiSplit * ms;
3177  const auto &msiter = classical_root_to_multisplit_->find(nd);
3178  if (msiter != classical_root_to_multisplit_->end()) {
3179  ms = msiter->second;
3180  ms->ithread = nt->id;
3181  if (ms->nd[1]) {
3182  if (ms->backbone_style >= 1) {
3183  is0 = iss0++;
3184  is1 = iss1++;
3185  } else {
3186  is0 = isl0++;
3187  is1 = isl1++;
3188  }
3189  i0 = ms->nd[0]->v_node_index;
3190  ii = ms->nd[1]->v_node_index;
3191  node[is1 - i1] = ms->nd[1];
3192  parent[is1 - i1] = nt->_v_parent[ii];
3193  ii = nt->_v_parent[ii]->v_node_index;
3194  while (i0 != ii) {
3195  --ib;
3196  node[ib - i1] = nt->_v_node[ii];
3197  parent[ib - i1] = nt->_v_parent[ii];
3198  //printf("sid0i[%d] = %d\n", ib-backbone_begin, is0);
3199  sid0i[ib - backbone_begin] = is0;
3200  ii = nt->_v_parent[ii]->v_node_index;
3201  }
3202  if (ms->backbone_style == 2) {
3203  backsid_[ibrt] = ms->sid[0];
3204  ms->back_index = ibrt;
3205  backAindex_[ibrt] = is0 - backbone_begin;
3206  backBindex_[ibrt] = is1 - backbone_begin;
3207  //printf("backAindex[%d] = %d sid=%d Bindex=%d\n",
3208  //ibrt, backAindex_[ibrt], backsid_[ibrt], backBindex_[ibrt]);
3209  ++ibrt;
3210  }
3211  }
3212  }
3213  }
3214  }
3215 //printf("backbone order complete\n");
3216 
3217  // most of the rest are already in tree order and have the correct
3218  // parents. They just need to be copied into node and parent
3219  // so that the parent index is alway < the node index
3220  // However, sid0 becoming root can cause many places where
3221  // parent_index > node_index and these need to be relocated
3222  for (i=i1; i < i3; ++i) {
3223  // can exploit this variable because it will be set later
3224  nt->_v_node[i]->eqn_index_ = -1;
3225  }
3226  for (i=i1; i < backbone_end; ++i) {
3227 #if 0
3228 printf("backbone i=%d %d %s %d", i, node[i]->v_node_index, secname(node[i]->sec), node[i]->sec_node_index_);
3229 printf(" -> %s %d\n", parent[i]?secname(parent[i]->sec):"root",
3230 parent[i]?parent[i]->sec_node_index_:-1);
3231 #endif
3232  node[i-i1]->eqn_index_ = i;
3233  }
3234  // the rest are in order
3235  j = backbone_end;
3236  for (i = i1; i < i3; ++i) {
3237  k0 = 0;
3238  k1 = i;
3239 //printf("i=%d\n", i);
3240  while (nt->_v_node[k1]->eqn_index_ < 0) {
3241 //printf("counting i=%d k1=%d k0=%d\n", i, k1, k0);
3242  ++k0;
3243  if (!nt->_v_parent[k1]) { break; }
3244  k1 = nt->_v_parent[k1]->v_node_index;
3245  }
3246 //printf("i=%d k0=%d\n", i, k0);
3247  k1 = i;
3248  j += k0;
3249  k0 = j - 1;
3250  while(nt->_v_node[k1]->eqn_index_ < 0) {
3251 //printf("ordering i=%d k1=%d k0=%d parent=%p\n", i, k1, k0, nt->_v_parent[k1]);
3252  node[k0-i1] = nt->_v_node[k1];
3253  parent[k0-i1] = nt->_v_parent[k1];
3254  node[k0-i1]->eqn_index_ = k0;
3255  --k0;
3256  if (!nt->_v_parent[k1]) { break; }
3257  k1 = nt->_v_parent[k1]->v_node_index;
3258  }
3259  }
3260  for (i = i1; i < i3; ++i) {
3261  assert(i == node[i-i1]->eqn_index_);
3262  nt->_v_node[i] = node[i-i1];
3263  nt->_v_parent[i] = parent[i-i1];
3264  nt->_v_node[i]->v_node_index = i;
3265  }
3266  delete [] node;
3267  delete [] parent;
3268 
3269 #if 0
3270 printf("\nmultisplit reordering\n");
3271 printf("backbone begin=%d long=%d interior=%d sid1=%d long=%d end=%d\n",
3272 backbone_begin, backbone_long_begin, backbone_interior_begin,
3273 backbone_sid1_begin, backbone_long_sid1_begin, backbone_end);
3274 for (i=i1; i < i3; ++i) {
3275  printf("%d %d %s %d ", nt->_v_node[i]->v_node_index,
3276  nt->_v_parent[i] ? nt->_v_parent[i]->v_node_index : -1,
3277  secname(nt->_v_node[i]->sec), nt->_v_node[i]->sec_node_index_);
3278  if (nt->_v_parent[i]) {
3279  printf(" -> %s %d\n", secname(nt->_v_parent[i]->sec), nt->_v_parent[i]->sec_node_index_);
3280  }else{
3281  printf(" root\n");
3282  }
3283 }
3284 #endif
3285 for (i=i1; i < i3; ++i) if (nt->_v_parent[i] && nt->_v_parent[i]->v_node_index >= i) {
3286 printf("i=%d parent=%d\n", i, nt->_v_parent[i]->v_node_index);
3287 }
3288  for (i=i1; i < i3; ++i) {
3289  assert(nt->_v_node[i]->v_node_index == i);
3290  assert(nt->_v_parent[i] == 0 || nt->_v_parent[i]->v_node_index < i);
3291  }
3292 
3293  // freeing sid1A... invalidated any offdiag_ pointers.
3294  for (i=0; i < msc_->nthost_; ++i) {
3295  MultiSplitTransferInfo& msti = msc_->msti_[i];
3296  for (j = 0; j < msti.nnode_rt_; j += 2) {
3297  msti.offdiag_[j] = sid1A + msti.ioffdiag_[j];
3298  msti.offdiag_[j+1] = sid1B + msti.ioffdiag_[j+1];
3299  }
3300  }
3301  // sid1A, sid1B pointers in reducedtree map will be updated by
3302  // rt_map_update along with d and rhs
3303 }
3304 
3305 void MultiSplitControl::pmat(bool full) {
3306 int it, i, ip, is;
3307 Printf("\n");
3308  for (it=0; it < nrn_nthread; ++it) {
3309  NrnThread* _nt = nrn_threads + it;
3310  MultiSplitThread& t = mth_[it];
3311  for (i=0; i < _nt->end; ++i) {
3312  is = _nt->_v_node[i]->_classical_parent ? _nt->_v_node[i]->sec_node_index_ : -1;
3313  Printf("%d %d %s %d", _nt->_v_node[i]->v_node_index,
3314  _nt->_v_parent[i] ? _nt->_v_parent[i]->v_node_index : -1,
3315  secname(_nt->_v_node[i]->sec), is);
3316  if (_nt->_v_parent[i]) {
3317  is = _nt->_v_parent[i]->_classical_parent ? _nt->_v_parent[i]->sec_node_index_ : -1;
3318  Printf(" -> %s %d", secname(_nt->_v_parent[i]->sec), is);
3319  Printf("\t %10.5g %10.5g", NODEB(_nt->_v_node[i]), NODEA(_nt->_v_node[i]));
3320  }else{
3321  Printf(" root\t\t %10.5g %10.5g", 0., 0.);
3322  }
3323 
3324  if (full) {
3325  Printf(" %10.5g %10.5g", NODED(_nt->_v_node[i]), NODERHS(_nt->_v_node[i]));
3326  if (t.sid0i && i >= t.backbone_begin && i < t.backbone_end) {
3327  Printf(" %10.5g %10.5g", t.S1B(i-t.backbone_begin), t.S1A(i-t.backbone_begin));
3328  }
3329  }
3330  Printf("\n");
3331  }
3332  }
3333 }
3334 
3335 void MultiSplitControl::pmatf(bool full) {
3336 int it, i, ip, is;
3337 FILE* f; char fname[100];
3338 
3339  sprintf(fname, "pmat.%04d", nrnmpi_myid);
3340  f = fopen(fname, "w");
3341  for (it=0; it < nrn_nthread; ++it) {
3342  NrnThread* _nt = nrn_threads + it;
3343  MultiSplitThread& t = mth_[it];
3344  fprintf(f, "%d %d\n", it, _nt->end);
3345  for (i=0; i < _nt->end; ++i) {
3346  is = _nt->_v_node[i]->_classical_parent ? _nt->_v_node[i]->sec_node_index_ : -1;
3347  fprintf(f,"%d %d %s %d", _nt->_v_node[i]->v_node_index,
3348  _nt->_v_parent[i] ? _nt->_v_parent[i]->v_node_index : -1,
3349  secname(_nt->_v_node[i]->sec), is);
3350  if (_nt->_v_parent[i]) {
3351  is = _nt->_v_parent[i]->_classical_parent ? _nt->_v_parent[i]->sec_node_index_ : -1;
3352  fprintf(f, " -> %s %d", secname(_nt->_v_parent[i]->sec), is);
3353  fprintf(f, "\t %10.5g %10.5g", NODEB(_nt->_v_node[i]), NODEA(_nt->_v_node[i]));
3354  }else{
3355  fprintf(f, " root\t\t %10.5g %10.5g", 0., 0.);
3356  }
3357 
3358  if (full) {
3359  fprintf(f, " %10.5g %10.5g", NODED(_nt->_v_node[i]), NODERHS(_nt->_v_node[i]));
3360  if (t.sid0i && i >= t.backbone_begin && i < t.backbone_end) {
3361  fprintf(f, " %10.5g %10.5g", t.S1B(i-t.backbone_begin), t.S1A(i-t.backbone_begin));
3362  }
3363  }
3364  fprintf(f, "\n");
3365  }
3366  }
3367  fclose(f);
3368 }
3369 
3370 void MultiSplitControl::pmat1(const char* s) {
3371  int it;
3372  double a, b, d, rhs;
3373  MultiSplit* ms;
3374  for (it=0; it < nrn_nthread; ++it) {
3375  NrnThread* _nt = nrn_threads + it;
3376  MultiSplitThread& t = mth_[it];
3377  int i1 = 0;
3378  int i3 = _nt->end;
3379  for (int ii=0; ii < multisplit_list_->count(); ++ii) {
3380  ms = multisplit_list_->item(ii);
3381  int i = ms->nd[0]->v_node_index;
3382  if (i < i1 || i >= i3) { continue; }
3383  d = D(i);
3384  rhs = RHS(i);
3385  a = b = 0.;
3386  if (ms->nd[1]) {
3387  a = mth_[it].S1A(0);
3388  }
3389  Printf("%2d %s sid=%d %12.5g %12.5g %12.5g %12.5g\n", nrnmpi_myid,
3390  s, ms->sid[0], b, d, a, rhs);
3391  if (ms->nd[1]) {
3392  d = D(ms->nd[1]->v_node_index);
3393  rhs = RHS(ms->nd[1]->v_node_index);
3394  a = 0.;
3395  b = t.S1B(t.backbone_sid1_begin - t.backbone_begin);
3396  Printf("%2d %s sid=%d %12.5g %12.5g %12.5g %12.5g\n", nrnmpi_myid,
3397  s, ms->sid[1], b, d, a, rhs);
3398  }
3399  }
3400  }
3401 }
3402 
3404  int i = nd->v_node_index;
3405  Node* pnd = nd->_classical_parent;
3406  NrnThread* _nt = nd->_nt;
3407  if (_nt->_v_parent[i] == pnd) {
3408  return &NODEA(nd);
3409  } else if (pnd == 0) {
3410  return 0;
3411  } else if (_nt->_v_parent[pnd->v_node_index] == nd) {
3412  return &NODEB(pnd);
3413  } else {
3414  assert(0);
3415  }
3416  return 0;
3417 }
3418 
3420  int i = nd->v_node_index;
3421  Node* pnd = nd->_classical_parent;
3422  NrnThread* _nt = nd->_nt;
3423  if (_nt->_v_parent[i] == pnd) {
3424  return &NODEB(nd);
3425  } else if (pnd == 0) {
3426  return 0;
3427  } else if (_nt->_v_parent[pnd->v_node_index] == nd) {
3428  return &NODEA(pnd);
3429  } else {
3430  assert(0);
3431  }
3432  return 0;
3433 }
ReducedTree * rt_
Definition: multisplit.cpp:271
ReducedTree(MultiSplitControl *, int rank, int mapsize)
#define Printf
Definition: model.h:252
double * _actual_d
Definition: multicore.h:71
void bksub_backbone(NrnThread *)
#define nrn_assert(ex)
Definition: nrnassrt.h:35
void nrn_multisplit_adjust_rhs(NrnThread *)
void fillsmap(int sid, double *prhs, double *pdiag)
double * b
Definition: multisplit.cpp:231
#define assert(ex)
Definition: hocassrt.h:26
#define B(i)
Definition: multisplit.cpp:62
short type
Definition: cabvars.h:10
short nnode
Definition: section.h:41
struct Node * parentnode
Definition: section.h:50
static void nrnmpi_int_allgather(int *, int *, int)
Definition: multisplit.cpp:49
void * nrn_multisplit_triang(NrnThread *)
MultiSplitControl * msc
Definition: multisplit.cpp:225
order
Definition: multicore.cpp:886
static double nrnmpi_splitcell_wait_
Definition: multisplit.cpp:43
struct Section * parentsec
Definition: section.h:42
#define NODED(n)
Definition: section.h:103
std::unique_ptr< MultiSplitTable > classical_root_to_multisplit_
double * nrn_classicalNodeA(Node *nd)
void * nrn_multisplit_bksub(NrnThread *)
void reduced_mark(int, int, int, int *, int *, int *)
int tree_changed
Definition: cabcode.cpp:19
void nrn_multisplit_nocap_v()
void
int * _v_parent_index
Definition: multicore.h:76
double ** smap
Definition: multisplit.cpp:239
Node ** _v_parent
Definition: multicore.h:78
size_t p
void pmatf(bool full=false)
Represent main neuron object computed by single thread.
Definition: multicore.h:58
double * _actual_rhs
Definition: multicore.h:70
void nrn_cachevec(int)
Definition: treeset.cpp:2108
char tbuf[5]
Definition: ddeclnt.cpp:35
static int nrnmpi_use
Definition: multisplit.cpp:48
static void nrnmpi_barrier()
Definition: multisplit.cpp:54
void scatter()
nd
Definition: treeset.cpp:893
struct Node * _classical_parent
Definition: section.h:156
#define v
Definition: md1redef.h:4
MultiSplitTransferInfo * msti_
void nrn_matrix_node_free()
Definition: treeset.cpp:1901
double * a
Definition: multisplit.cpp:230
std::unique_ptr< Int2IntTable > s2rt
Definition: multisplit.cpp:248
int * rmap2smap_index
Definition: multisplit.cpp:243
static philox4x32_key_t k
Definition: nrnran123.cpp:11
sprintf(buf," if (secondorder) {\ " int _i;\" " for(_i=0;_i< %d;++_i) {\" " _p[_slist%d[_i]]+=dt *_p[_dlist%d[_i]];\" " }}\", numeqn, listnum, listnum)
void nrn_multisplit_nocap_v_part1(NrnThread *)
#define e
Definition: passive0.cpp:24
std::unordered_map< Node *, MultiSplit * > MultiSplitTable
Definition: multisplit.cpp:300
void triang(NrnThread *)
j< sec-> nnode
Definition: treeset.cpp:905
#define D(i)
Definition: multisplit.cpp:63
void pmat(bool full=false)
static void nrnmpi_postrecv_doubles(double *, int, int, int, void **)
Definition: multisplit.cpp:51
static void multisplit_v_setup()
int use_sparse13
Definition: treeset.cpp:69
int id
Definition: multicore.h:66
MultiSplitThread * mth_
int ncell
Definition: multicore.h:64
void solve(void)
Definition: nonlin.cpp:287
#define implementPtrList(PtrList, T)
void bksub_subtrees(NrnThread *)
void bksub(NrnThread *)
int nrn_nthread
Definition: multicore.cpp:44
Node ** _v_node
Definition: multicore.h:77
void nrn_multisplit_nocap_v_part3(NrnThread *)
int diam_changed
Definition: cabcode.cpp:23
#define root
Definition: rbtqueue.cpp:53
int nrnmpi_numprocs
double * v
Definition: multisplit.cpp:245
_CONST char * s
Definition: system.cpp:74
void nrn_multisplit_nocap_v_part2(NrnThread *)
NrnThread * nrn_threads
Definition: multicore.cpp:45
#define NODEA(n)
Definition: section.h:123
void triang_subtree2backbone(NrnThread *)
virtual ~ReducedTree()
static MultiSplit MultiSplitControl * msc_
Definition: multisplit.cpp:305
#define VEC_AREA(i)
Definition: section.h:122
#define printf
Definition: mwprefix.h:26
const char * secname(Section *sec)
Definition: cabcode.cpp:1787
int use_cachevec
Definition: treeset.cpp:61
void nrnmpi_multisplit(Section *, double x, int sid, int backbone_style)
Definition: multisplit.cpp:307
struct NrnThread * _nt
Definition: section.h:157
static const char * fname(const char *name)
Definition: nrnbbs.cpp:108
static Node * pnd
Definition: clamp.cpp:33
double * nrn_classicalNodeB(Node *nd)
void reorder(int j, int nt, int *mark, int *all_bb_relation, int *allsid)
void hoc_execerror(const char *, const char *)
Definition: hoc.cpp:741
errno
Definition: system.cpp:98
size_t j
std::unordered_map< int, int > Int2IntTable
Definition: multisplit.cpp:205
void triang_backbone(NrnThread *)
fprintf(stderr, "Don't know the location of params at %p\, pp)
double * rhs
Definition: multisplit.cpp:228
void matrix_exchange_nocap()
void(* nrn_multisplit_setup_)()
Definition: treeset.cpp:46
static double gather(void *v)
void * nrn_multisplit_reduce_solve(NrnThread *)
#define NODEB(n)
Definition: section.h:124
double * pd[3]
Definition: multisplit.cpp:200
#define parent
Definition: rbtqueue.cpp:47
virtual void move(const Event &e)
Definition: ocinput.h:19
virtual ~MultiSplitThread()
Definition: multisplit.cpp:358
#define ne
Definition: redef.h:94
#define VEC_V(i)
Definition: section.h:121
static void nrnmpi_int_allgatherv(int *, int *, int *, int *)
Definition: multisplit.cpp:50
static void nrnmpi_send_doubles(double *, int, int, int)
Definition: multisplit.cpp:52
void pr_map(int, double *)
int end
Definition: multicore.h:65
#define rhs
Definition: lineq.h:6
double nrnmpi_rtcomp_time_
Definition: ocbbs.cpp:33
void bksub_short_backbone_part1(NrnThread *)
void nrnmpi_multisplit_clear()
Definition: multisplit.cpp:500
static double scatter(void *v)
void multisplit_nocap_v_part1(NrnThread *)
#define NODERHS(n)
Definition: section.h:104
MultiSplit * ms
Definition: multisplit.cpp:195
int backbone_style
Definition: multisplit.cpp:258
void multisplit_nocap_v_part3(NrnThread *)
static void multisplit_solve()
int v_node_index
Definition: section.h:174
Section * sec
Definition: section.h:154
int nrnmpi_myid
void v_setup(NrnThread *)
#define S1B(i)
Definition: multisplit.cpp:66
#define i
Definition: md1redef.h:12
#define id
Definition: md1redef.h:33
#define A(i)
Definition: multisplit.cpp:61
int ibuf[3]
Definition: multisplit.cpp:193
double adjust_rhs_
Definition: multisplit.cpp:194
void recalc_diam()
Definition: treeset.cpp:940
void setup_topology()
Definition: cabcode.cpp:1736
static double nrnmpi_wtime()
Definition: multisplit.cpp:55
void multisplit_nocap_v_part2(NrnThread *)
sec
Definition: solve.cpp:885
Node * nd[2]
Definition: multisplit.cpp:256
#define RHS(i)
Definition: multisplit.cpp:64
void multisplit(Section *, double, int, int)
Definition: multisplit.cpp:362
#define VEC_D(i)
Definition: section.h:119
int sec_node_index_
Definition: section.h:176
int nrn_multisplit_active_
Definition: multisplit.cpp:15
double * d
Definition: multisplit.cpp:229
MultiSplit * ms
Definition: multisplit.cpp:202
static Node * node(Object *)
Definition: netcvode.cpp:318
Definition: section.h:132
declarePtrList(MultiSplitList, MultiSplit) implementPtrList(MultiSplitList
char * nh
Definition: mos2nrn.cpp:11
MultiSplitList * multisplit_list_
static void nrnmpi_wait(void **)
Definition: multisplit.cpp:53
int eqn_index_
Definition: section.h:148
int sid[2]
Definition: multisplit.cpp:257
void fillrmap(int sid1, int sid2, double *pd)
double ** rmap
Definition: multisplit.cpp:239
#define VEC_RHS(i)
Definition: section.h:120
FILE * fopen()
#define S1A(i)
Definition: multisplit.cpp:65
double t
Definition: init.cpp:123
double adjust_rhs_
Definition: multisplit.cpp:201
Node * node_exact(Section *sec, double x)
Definition: cabcode.cpp:1956
void nrn_multisplit_ptr_update()
void pmat1(const char *)
void(* nrn_multisplit_solve_)()
Definition: solve.cpp:71
virtual ~MultiSplitControl()
Definition: multisplit.cpp:342
void multisplit_adjust_rhs(NrnThread *)