-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdynamicLB.tex
496 lines (461 loc) · 15.2 KB
/
dynamicLB.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
\comment{
\begin{frame}[fragile]
\frametitle{How to Diagnose Load Imbalance}
\begin{itemize}
\item Often hidden in statements such as:
\begin{itemize}
\item Very high synchronization overhead
\begin{itemize}
\item Most processors are waiting at a reduction
\end{itemize}
\end{itemize}
\item Count total amount of computation (ops/flops) per processor
\begin{itemize}
\item In each phase!
\item Because the balance may change from phase to phase
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Golden Rule of Load Balancing}
\emph{Fallacy: objective of load balancing is to minimize variance in load across processors}
\begin{itemize}
\item[]\emph{Example:}
\begin{itemize}
\item 50,000 tasks of equal size, 500 processors:
\begin{itemize}
\item A: All processors get 99, except last 5 gets $100+99 = 199$
\item OR, B: All processors have 101, except last 5 get 1
\end{itemize}
\end{itemize}
\item[] Identical variance, but situation A is much worse!
\end{itemize}
\emph{Golden Rule: It is ok if a few processors idle, but avoid having processors that are overloaded with work}
\emph{Finish time} = $max_i$(Time on processor $i$)
\begin{itemize}
\item[] excepting data dependence and communication overhead issues
\end{itemize}
The speed of any group is the speed of slowest member of that group.
\end{frame}
}
\begin{frame}[fragile]
\frametitle{Automatic Dynamic Load Balancing}
\begin{itemize}
\item Measurement based load balancers
\begin{itemize}
\item Principle of persistence: In many CSE applications, computational loads and communication patterns tend to persist, even in dynamic computations
\item Therefore, recent past is a good predictor of near future
\item Charm++ provides a suite of load-balancers
\item Periodic measurement and migration of objects
\end{itemize}
\item Seed balancers (for task-parallelism)
\begin{itemize}
\item Useful for divide-and-conquer and state-space-search applications
\item Seeds for charm++ objects moved around until they take root
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Using the Load Balancer}
\begin{itemize}
\item link a LB module
\begin{itemize}
\item \code{-module <strategy>}
\item RefineLB, NeighborLB, GreedyCommLB, others…
\item EveryLB will include all load balancing strategies
\end{itemize}
\item compile time option (specify default balancer)
\begin{itemize}
\item \code{-balancer RefineLB}
\item runtime option
\item \code {+balancer RefineLB}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}
\frametitle{Code to Use Load Balancing}
\begin{itemize}
\item Insert \code{if (myLBStep) AtSync() else ResumeFromSync();} call at natural barrier
\item Implement \code{ResumeFromSync()} to resume execution
\begin{itemize}
\item Typical ResumeFromSync contribute to a reduction
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Example: Stencil}
\begin{lstlisting}[basicstyle=\tiny]
while (!converged) {
serial {
int x = thisIndex.x, y = thisIndex.y, z = thisIndex.z;
copyToBoundaries();
thisProxy(wrapX(x-1),y,z).updateGhosts(i, RIGHT, dimY, dimZ, right);
/* ...similar calls to send the 6 boundaries... */
thisProxy(x,y,wrapZ(z+1)).updateGhosts(i, FRONT, dimX, dimY, front);
}
for (remoteCount = 0; remoteCount < 6; remoteCount++) {
when updateGhosts[i](int i, int d, int w, int h, double b[w*h])
serial { updateBoundary(d, w, h, b); }
}
serial {
int c = computeKernel() < DELTA;
CkCallback cb(CkReductionTarget(Jacobi, checkConverged), thisProxy);
if (i%5 == 1) contribute(sizeof(int), \&c, CkReduction::logical_and, cb);
}
if (i % lbPeriod == 0) { serial { AtSync(); } when ResumeFromSync() {} }
if (++i % 5 == 0) {
when checkConverged(bool result) serial {
if (result) { mainProxy.done(); converged = true; }
}
}
}
\end{lstlisting}
\end{frame}
\begin{frame}
\frametitle{Performance}
\begin{centering}
\includegraphics[width=0.5\textwidth]{figures/beforeLB}
\includegraphics[width=0.5\textwidth]{figures/afterLB}
\end{centering}
\end{frame}
%\begin{frame}[fragile]
%\frametitle{Grainsize and Load Balancing}
%\begin{itemize}
%\item[] How Much Balance Is Possible?
%\end{itemize}
%\begin{columns}
% \begin{column}[T]{2.8in}
% \includegraphics[width=2.8in, height=2.0in]{figures/histogramGrains}
% \end{column}
%
% \begin{column}[T]{5cm}
% Solution:\\
% Split compute objects that may have too much work,
% using a heuristic based on number of interacting atoms
% \end{column}
%\end{columns}
%\end{frame}
%\begin{frame}[fragile]
%\frametitle{Grainsize For Extreme Scaling}
%\begin{itemize}
% \item Strong Scaling is limited by expressed parallelism
% \begin{itemize}
% \item Minimum iteration time limited by lengthiest computation
% \begin {itemize}
% \item Largest grains set lower bound
% \end{itemize}
% \end{itemize}
% \item 1-away generalized to k-away provides fine granularity control
%\end{itemize}
%\begin{centering}
%\includegraphics[width=1.0\textwidth]{figures/1away2away}
%\end{centering}
%\end{frame}
%
%\begin{frame}[fragile]
%\frametitle{NAMD: 2-AwayX Example}
%\begin{centering}
%\includegraphics[width=1.0\textwidth]{figures/2awayDiagramPlusHistos}
%\end{centering}
%\end{frame}
%
\comment{
\begin{frame}[fragile]
\frametitle{Load Balancing Strategies}
\begin{itemize}
\item Classified by when it is done:
\begin{itemize}
\item Initially
\item Dynamic: Periodically
\item Dynamic: Continuously
\end{itemize}
\item Classified by whether decisions are taken with global information
\begin{itemize}
\item Fully centralized
\begin{itemize}
\item Quite good a choice when load balancing period is high
\end{itemize}
\item Fully distributed
\begin{itemize}
\item Each processor knows only about a constant number of neighbors
\item Extreme case: totally local decision (send work to a random destination processor, with some probability).
\end{itemize}
\item Use \emph{aggregated} global information, and \emph{detailed} neighborhood info.
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Dynamic Load Balancing Scenarios}
\begin{itemize}
\item Examples representing typical classes of situations
\begin{itemize}
\item Particles distributed over simulation space
\begin{itemize}
\item Dynamic: because Particles move.
\item Cases:\\
Highly non-uniform distribution (cosmology)\\
Relatively Uniform distribution
%\begin{description}[parskip]
%\item Highly non-uniform distribution (cosmology)
%\item Relatively Uniform distribution
%\end{description}
\end{itemize}
\end{itemize}
\item Structured grids, with dynamic refinements/coarsening
\item Unstructured grids with dynamic refinements/coarsening
\end{itemize}
\end{frame}
%\begin{frame}[fragile]
%\frametitle{Example Case: Particles}
% Orthogonal Recursive Bisection (ORB)
% %should wrapfig the little diagram here
% \begin{itemize}
% \item At each stage: divide Particles equally
% \item Processor don’t need to be a power of 2:
% \begin{itemize}
% \item Divide in proportion
% \begin{itemize}
% \item 2:3 with 5 processors
% \end{itemize}
% \end{itemize}
% \item How to choose the dimension along which to cut?
% \begin{itemize}
% \item Choose the longest one
% \end{itemize}
% \item How to draw the line?
% \begin{itemize}
% \item All data on one processor? Sort along each dimension
% \item Otherwise: run a distributed histogramming algorithm to find the line, recursively
% \end{itemize}
% \item Find the entire tree, and then do all data movement at once
% \begin{itemize}
% \item Or do it in two-three steps.
% \item But no reason to redistribute particles after drawing each line.
% \end{itemize}
%\end{itemize}
%\end{frame}
\begin{frame}[fragile]
\frametitle{Dynamic Load Balancing using Objects}
Object based decomposition (I.e. virtualized decomposition) helps
\begin{itemize}
\item Allows RTS to remap them to balance load
\item But how does the RTS decide where to map objects?
\item Just move objects away from overloaded processors to underloaded processors
\item How is load determined?
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Measurement Based Load Balancing}
\begin{itemize}
\item \emph{Principle of Persistence}
\begin{itemize}
\item Object communication patterns and computational loads tend to persist over time
\item In spite of dynamic behavior
\begin{itemize}
\item Abrupt but infrequent changes
\item Slow and small changes
\end{itemize}
\end{itemize}
\item Runtime instrumentation
\begin{itemize}
\item Measures communication volume and computation time
\end{itemize}
\item Measurement based load balancers
\begin{itemize}
\item Use the instrumented data-base periodically to make new decisions
\item Many alternative strategies can use the database
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Periodic Load Balancing}
Stop the computation?
Centralized strategies:
\begin{itemize}
\item Charm RTS collects data (on one processor) about:
\begin{itemize}
\item Computational Load and Communication for each pair
\end{itemize}
\item If you are not using AMPI/Charm, you can do the same instrumentation and data collection
\item Partition the graph of objects across processors
\begin{itemize}
\item Take communication into account
\begin{itemize}
\item Pt-to-pt, as well as multicast over a subset
\item As you map an object, add to the load on both sending and receiving processor
\end{itemize}
\item Multicasts to multiple co-located objects are effectively the cost of a single send
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Typical Load Balancing Steps}
\begin{centering}
\includegraphics[width=1.0\textwidth]{figures/LBStepsDiagram}
\end{centering}
\end{frame}
}
\begin{frame}[fragile]
\frametitle{Object Partitioning Strategies}
\begin{itemize}
\item You can use graph partitioners like METIS, K-R
\begin{itemize}
\item BUT: graphs are smaller, and optimization criteria are different
\end{itemize}
\item Greedy strategies:
\begin{itemize}
\item If communication costs are low: use a simple greedy strategy
\begin{itemize}
\item Sort objects by decreasing load
\item Maintain processors in a heap (by assigned load)
\item In each step: \\
%\begin{description}
% \item
assign the heaviest remaining object to the least loaded processor
%\end{description}
\end{itemize}
\item With small-to-moderate communication cost:
\begin{itemize}
\item Same strategy, but add communication costs as you add an object to a processor
\end{itemize}
\item Always add a refinement step at the end:
\begin{itemize}
\item Swap work from heaviest loaded processor to ``some other processor''
\item Repeat a few times or until no improvement
\end{itemize}
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Object Partitioning Strategies 2}
When communication cost is significant:
\begin{itemize}
\item Still use greedy strategy, but:
\begin{itemize}
\item At each assignment step, choose between assigning O to least loaded processor and the processor that already has objects that communicate most with O.
\begin{itemize}
\item Based on the degree of difference in the two metrics
\item Two-stage assignments:\\
In early stages, consider communication costs as long as the processors are
in the same (broad) load “class”,\\
In later stages, decide based on load
\end{itemize}
\end{itemize}
\end{itemize}
Branch-and-bound
\begin{itemize}
\item Searches for optimal, but can be stopped after a fixed time
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Crack Propagation}
\begin{center}
\includegraphics[width=0.35\textwidth]{figures/chunkGraph16}
\includegraphics[width=0.35\textwidth]{figures/chunkGraph128}
\end{center}
\begin{itemize}
\item Decomposition into 16 chunks (left) and 128 chunks, 8 for each PE
(right). Both decompositions obtained using Metis. Pictures: S. Breitenfeld,
and P. Geubelle
\item As computation progresses, crack propagates, and new elements are
added, leading to more complex computations in some chunks
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Load Balancing Crack Propagation}
\begin{centering}
\includegraphics[width=1.0\textwidth]{figures/LButilizationCrackPropWithAnnotation}
\end{centering}
\end{frame}
\begin{frame}[fragile]
\frametitle{Distributed Load balancing}
\begin{itemize}
\item Centralized strategies
\begin{itemize}
\item Still ok for 3000 processors for NAMD
\end{itemize}
\item Distributed balancing is needed when:
\begin{itemize}
\item Number of processors is large and/or
\item load variation is rapid
\end{itemize}
\item Large machines:
\begin{itemize}
\item Need to handle locality of communication
\begin{itemize}
\item Topology sensitive placement
\end{itemize}
\item Need to work with scant global information
\begin{itemize}
\item Approximate or aggregated global information (average/max load)
\item Incomplete global info (only “neighborhood”)
\item Work diffusion strategies (1980s work by Kale and others!)
\end{itemize}
\item Achieving global effects by local action…
\end{itemize}
\end{itemize}
\end{frame}
\comment{
\begin{frame}[fragile]
\frametitle{Load Balancing on Large Machines}
\begin{itemize}
\item Centralized load balancing strategies don’t scale on extremely large machines
\item Limitations of centralized strategies:
\begin{itemize}
\item Central node: memory/communication bottleneck
\item Decision-making algorithms tend to be very slow
\end{itemize}
\item Limitations of distributed strategies:
\begin{itemize}
\item Difficult to achieve well-informed load balancing decisions
\end{itemize}
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Simulation Study - Memory Overhead}
lb\_test experiments performed with the performance simulator BigSim
\includegraphics[width=1.0\textwidth]{figures/LBMemOverhead}
\begin{itemize}
\item lb\_test benchmark is a parameterized program that creates a
specified number of communicating objects in 2D-mesh.
\end{itemize}
\end{frame}
}
\begin{frame}[fragile]
\frametitle{Hierarchical Load Balancers}
\begin{itemize}
\item Partition processor allocation into processor groups
\item Apply different strategies at each level
\item Scalable to a large number of processors
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Our Hybrid Scheme}
\includegraphics[width=1.0\textwidth]{figures/hybridLBScheme}
\end{frame}
\comment{
\begin{frame}[fragile]
\frametitle{Hybrid Load Balancing Performance}
\includegraphics[width=0.5\textwidth]{figures/hybridLBPerf}
\includegraphics[width=0.5\textwidth]{figures/hybridLBquality}
\end{frame}
}
\begin{frame}[fragile]
\frametitle{MetaBalancer - When and how to load balance?}
\begin{itemize}
\item Difficult to find the optimum load balancing period
\begin{itemize}
\item Depends on the application characteristics
\item Depends on the machine the application is run on
\end{itemize}
\item Monitors the application continuously and predicts behavior.
\item Decides when to invoke which load balancer.
\item Command line argument - \texttt{+MetaLB}
\end{itemize}
\end{frame}
\comment{
\begin{frame}[fragile]
\frametitle{Metabalancer Performance for LeanMD mini-App}
\includegraphics{figures/bgp-metabalancer-leanmd}
\end{frame}
}