-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathchallenges.tex
218 lines (201 loc) · 7.29 KB
/
challenges.tex
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
\comment{
\begin{frame}[t]
\frametitle{Harnessing Parallelism: Challenges}
\framesubtitle{Trends in System Architecture}
\begin{itemize}
\item Data movement is increasingly expensive
\begin{itemize}
\item (relatively) slow memory
\item (relatively) slow interconnects
\end{itemize}
\item Energy / Power considerations will exacerbate these issues
\pause
\item Compute resources are cheap
\begin{itemize}
\item Smaller footprint on silicon
\item Lower energy share compared to memory + interconnect
\end{itemize}
\end{itemize}
\pause
\begin{block}{Implications}
More available compute cycles for each incoming byte of data
%\pause
%\begin{itemize}
% \item Restructure code to extract all flops from incoming operands
% \item Avoid moving data. Instead recompute!
% \item Communication-avoiding algorithms?
%\end{itemize}
\end{block}
\end{frame}
\begin{frame}[shrink]
\frametitle{Harnessing Parallelism: Challenges}
\framesubtitle{Trends in System Architecture}
\begin{columns}
\begin{column}{0.60\textwidth}
\begin{itemize}
\item However, compute resources are not faster cores, but \textbf{more cores}
\item Unprecedented levels of available concurrency
\begin{itemize}
\item IBM BG/Q
\begin{itemize}
\item `Sequoia': 1,572,864 cores
\item `Mira': 786,432 cores
\end{itemize}
\item Cray
\begin{itemize}
\item XE6 `Blue Waters': $>$ 380,000 cores
\item XK6 `Titan': 299,008 cores
\end{itemize}
\item K Supercomputer: 705,024 cores
\end{itemize}
\end{itemize}
\end{column}
\begin{column}{0.40\textwidth}
\includegraphics[width=1\textwidth]{figures/mira.jpg}
\end{column}
\end{columns}
\pause
\begin{block}{Implications}
\begin{itemize}
\item Each thread of execution has to:
\begin{itemize}
\item operate on less data
\item wait relatively longer for remote data
\end{itemize}
\item Have to operate in \textbf{strong scaling} regime
% Even if you don't do anything, an 8GB problem will have to run on
% more cores a few years from now simply because there will be many
% more cores for the same 8GB
% If a product has to stay ahead of the competition, it has to scale
% the same problem to even more cores with time to run faster
\end{itemize}
\end{block}
\end{frame}
\begin{frame}[t]
\frametitle{Harnessing Parallelism: Challenges}
\framesubtitle{Trends in System Architecture}
\begin{itemize}
\item Growing heterogeneity: Each thread of execution has different abilities
\begin{itemize}
\item NUMA cores on each node
\item Secondary threads in SMT can be less capable (IBM Power7)
\item Accelerators (NVIDIA / AMD GPUs, Intel MIC, etc.)
\item Some cores run system daemons
\item Combinations of above
\begin{itemize}
\item GPUs only on some nodes (Blue Waters: NCSA)
\item MIC + SMT nodes (Stampede: TACC)
\end{itemize}
\end{itemize}
\pause
\item Each architecture expects different treatment
\begin{itemize}
\item GPUs expect evenly divided pieces
\item SMT might expect uneven pieces (IBM Power7)
\end{itemize}
\end{itemize}
\pause
\begin{block}{Implications}
\begin{itemize}
\item Achieving load balance on such hardware is challenging
\item Minimizing idle time is extremely difficult
\end{itemize}
\end{block}
\end{frame}
\begin{frame}[t]
\frametitle{Harnessing Parallelism: Challenges}
\framesubtitle{Trends in System Architecture}
\begin{itemize}
\item Extracting performance on a tighter power / energy budget
\item Hardware component failures / faults
\end{itemize}
\end{frame}
\begin{frame}[fragile]
\frametitle{Harnessing Parallelism: Challenges}
\framesubtitle{Application characteristics}
\begin{itemize}
\item Complex physics in multiple, interacting modules
\item Adaptive, spatial and temporal resolutions
\item Need for faster solutions (not just larger problems)
\end{itemize}
\end{frame}
\begin{frame}[t]
\frametitle{Observations}
\framesubtitle{Exascale Applications}
\begin{itemize}
\item Development of new models must be driven by the needs of exascale applications
\begin{itemize}
\item Multi-resolution
\item Multi-module (multi-physics)
\item Dynamic/adaptive: to handle application variation
\item Adapt to a volatile computational environment
\item Exploit heterogeneous architecture
\item Deal with thermal and energy considerations
\end{itemize}
\pause
\item So? Consequences:
\begin{itemize}
\item Must support automated resource management
\item Must support interoperability and parallel composition
\end{itemize}
\end{itemize}
\end{frame}
}
\begin{frame}[t]
\frametitle{Harnessing Parallelism: Challenges}
\framesubtitle{Complex Machines and Sophisticated Applications}
\begin{itemize}
\item Hardware Challenges
\begin{itemize}
\item Large machines (largest have about 2 million cores!)
\item Multicore nodes
\item Heterogeneity
\item Upcoming: power/energy/thermal considerations, and component failures
\end{itemize}
\item Applications are getting more sophisticated
\begin{itemize}
\item Dynamic and adaptive refinements
\item Multi-physics codes
\item Need for strong scaling
\end{itemize}
\end{itemize}
\end{frame}
% \begin{frame}[fragile]
% \frametitle{Harnessing Parallelism: Challenges}
% \framesubtitle{2D Jacobi Iterations: 5-point Stencil}
% \begin{center} \includegraphics[width=0.6\textwidth]{figures/2DJacobi_NeighborComm.jpg} \end{center}
% \end{frame}
% \begin{frame}[fragile]
% \frametitle{Harnessing Parallelism: Challenges}
% \framesubtitle{2D Jacobi Iterations: No Overlap}
% \lstinputlisting{code/jacobi2D-mpi.c}
% \end{frame}
% \begin{frame}[fragile, shrink]
% \frametitle{Harnessing Parallelism: Challenges}
% \framesubtitle{2D Jacobi Iterations: Some Overlap}
% \lstinputlisting{code/jacobi2D-mpi-nonblocking.c}
% \end{frame}
\comment{
\begin{frame}[fragile]
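% NOTE: duplicate title/subtitle pair; the \frametitle and \framesubtitle that follow supersede these.
% \frametitle{Harnessing Parallelism: Challenges}
% \framesubtitle{Composing Independent Parallel Modules}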
\frametitle{Harnessing Parallelism}
\framesubtitle{Composing Independent Parallel Modules}
\begin{itemize}
\item Sequential blocks (SEBs) in two modules can be partitioned in space
\end{itemize}
\begin{center}
\includegraphics[width=0.8\textwidth]{figures/spaceDivision.pdf}
\end{center}
\end{frame}
\begin{frame}[fragile]
\frametitle{Harnessing Parallelism: Challenges}
\framesubtitle{Composing Independent Parallel Modules}
\begin{itemize}
\item Sequential blocks (SEBs) in two modules can be partitioned in time
\end{itemize}
\begin{center}
\includegraphics[width=0.8\textwidth]{figures/timeDivision.pdf}
\end{center}
\end{frame}
}