# EuroCC@Turkey Parallel Computing on GPUs with CUDA Dr. Özcan DÜLGER Computer Engineering, Middle East Technical University Computer Engineering, Artvin Coruh University 27 April 2022 #### Contents - Debugging and Profiling Performance - Performance Optimization and Efficiency - Some Libraries and Remaining Issues - CUDA Samples #### TESLA K40 | Property | Value | Property | Value | | |---------------------|---------|------------------|--------------|--| | Architecture | Kepler | Global Memory | 11520 MB | | | Number of SMX | 15 | Shared Memory | 49152 Byte | | | CUDA Core | 2880 | L2 Cache | 1572864 Byte | | | Core Clock | 745 MHz | Segment Size | 128 Byte | | | Max. Thread / SMX | 2048 | Warp Size | 32 | | | Max. Thread / Block | 1024 | Max. Block / SMX | 16 | | # Performance Optimization and Efficiency - Memory Coalesced Access to Global Memory - Warp Divergence - Device Occupancy and SM Efficiency - ► Efficient Parallel Reduction Algorithm #### Memory Coalesced Access - ▶ Reading from or writing to global memory performs segment by segment - ► The threads in a warp are physically related to each other. That means a warp completes its instruction when all the threads in the warp complete the instruction - In global memory operations, if the threads in the warp access to the different segments of the global memory, the operations become serial #### None Coalesced Access #### Global Memory 27 April 2022 # **Memory Coalesced Access** Global Memory warp1 warp2 Last warp ``` qlobal void full coalesced access(float *A,float *B,float *C) 3 unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id Load in one transaction 4 C[tid] = A[tid] + B[tid];//Vector addition 5 } 6 _global__ void non coalesced access(float *A,float *B,float *C,curandInitializer RNGs,unsigned int NP) 8 { 9 unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x://Global thread id 10 11 curandState t state;// State of the generator Load the state of the generator of each thread from 12 RNGs.load(state,tid);// Loading the state _____ 13 unsigned int index,i; global memory as a coalesced way 15 for(i=0;i<100;i++) 16 17 index = curand(&state)/NP;// Generate a random number between 0 and data size-1 Load in at most 32 transactions C[tid] = A[index] + B[index];//Vector addition 19 20 } 22 int main(int argc, char **argv) Example of Memory Coalesced Access 23 { 24 unsigned int data size = 4194304;//Data size float *A host, *B host, *C host ://Host Arrays float *A GPU, *B GPU, *C GPU; // Device Arrays 26 27 28 for(int counter = 0; counter < data size; counter++)</pre> Note: We use cudaEventRecord in order to 29 measure the kernel execution times A host[counter] = counter+1;//Assigning numbers from 1 to size 30 B host[counter] = counter+2;//Assigning numbers from 2 to size+1 31 32 33 34 cudaMemcpy(A GPU,A host,sizeof(float)*data size,cudaMemcpyHostToDevice); cudaMemcpy(B GPU,B host,sizeof(float)*data size,cudaMemcpyHostToDevice); 37 unsigned int NTB = 1024;//Number of threads in a block 38 unsigned int NP data size = (unsigned long int)pow(2,32)/data size;// Number of partitions for 'data size' XORWOW Generators dim3 threadsPerBlock(NTB);//Number of threads in a block 40 dim3 numBlocks(data size/NTB);//Number of blocks in a grid 41 42 curandInitializer RNGs(data size);//Creating generators for each thread in 'non coalesced access' kernel 43 unsigned int clck = clock(); 44 initialize RNGs<<<numBlocks,threadsPerBlock>>>(RNGs,clck);//Initializing the states of each generator 45 full coalesced access<<<numBlocks,threadsPerBlock>>>(A GPU,B GPU,C GPU);//Launching 'full coalesced access' kernel 47 non coalesced access<<<numBlocks,threadsPerBlock>>>(A GPU,B GPU,C GPU,RNGs,NP data size);//Launching 'non coalesced access' kernel 48 cudaDeviceSynchronize()://Waits until the kernel completes its run 49 50 } ``` ``` global void full coalesced access(float *A,float *B,float *C) 2 { unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id 3 C[tid] = A[tid] + B[tid];//Vector addition 5 } 6 _global__ void non coalesced access(float *A,float *B,float *C,curandInitializer RNGs,unsigned int NP) 8 { unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id 9 10 11 curandState t state;// State of the generator 12 RNGs.load(state,tid);// Loading the state 13 unsigned int index,i; 14 15 for(i=0;i<100;i++) 16 17 index = curand(&state)/NP;// Generate a random number between 0 and data size-1 18 C[tid] = A[index] + B[index];//Vector addition 19 20 } Metrics: 22 int main(int argc, char **argv) gld transactions: Number of global memory load transactions 23 { gld_transactions_per_request: Average number of global memory load 24 unsigned int data size = 32768; //Data size transactions performed for each global memory load float *A host, *B host, *C host ;//Host Arrays 26 float *A GPII *R GPII *C GPII · / / Device Arrays ``` Exec. Time of 'full\_coalesced\_access' kernel = 9.504e-06 Exec. Time of 'non\_coalesced\_access' kernel = 0.00168371 Speed Up = 177.158X ``` ==11093== Profiling result: ==11093== Metric result: Metric Name Metric Description Min Invocations Avg Max Device "Tesla K40c (0)" Kernel: full_coalesced_access(float*, float*, float*) gld transactions Global Load Transactions 2048 2048 2048 gld_transactions_per_request Global Load Transactions Per Request 1.000000 1.000000 1.000000 Kernel: non_coalesced_access(float*, float*, float*, curandInitializer, unsigned int) gld_transactions Global Load Transactions 6461574 6461574 6461574 1 Global Load Transactions Per Request gld_transactions_per_request 30.631703 30.631703 30.631703 ``` ``` 1 global void full coalesced access(float *A,float *B,float *C) unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id 3 4 C[tid] = A[tid] + B[tid]://Vector addition 5 } 7 global void non coalesced access(float *A,float *B,float *C,curandInitializer RNGs,unsigned int NP) 8 { unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id 9 10 11 curandState t state;// State of the generator 12 RNGs.load(state,tid);// Loading the state 13 unsigned int index,i; 14 15 for(i=0;i<100;i++) 16 17 index = curand(&state)/NP;// Generate a random number between 0 and data size-1 18 C[tid] = A[index] + B[index];//Vector addition 20 } 21 22 int main(int argc, char **argv) 23 { unsigned int data size = (4194304;)//Data size 24 float *A host, *B host, *C host // Host Arrays flost *A CDII *P CDII *C CDII //Doutes Arrous Exec. Time of 'full coalesced access' kernel = 0.000290432 Exec. Time of 'non coalesced access' kernel = 0.516614 Speed Up = 1778.78X ``` ``` ==11325== Profiling result: ==11325== Metric result: Invocations Metric Description Min Metric Name Max Avg Device "Tesla K40c (0)" Kernel: full_coalesced_access(float*, float*, float*) gld_transactions Global Load Transactions 1 262144 262144 262144 gld_transactions_per_request Global Load Transactions Per Request 1.000000 1.000000 1.000000 Kernel: non_coalesced_access(float*, float*, float*, curandInitializer, unsigned int) gld_transactions Global Load Transactions 839548174 839548174 839548174 gld transactions per request Global Load Transactions Per Request 31.093419 31.093419 31.093419 ``` # Grouping #### Global Memory - A group consists of contiguous segments - The number of segments in the group can be - between 1 and 32 (warp size) Dülger, Ö., Oğuztüzün, H. & Demirekler, M. Memory Coalescing Implementation of Metropolis Resampling on Graphics Processing Unit. J Sign Process Syst 90, 433-447 (2018) # Grouping Global Memory ``` global void semi coalesced access(float *A,float *B,float *C,curandInitializer RNGs1,curandInitializer RNGs2,unsigned int NPP group count,unsigned int NPP group size,unsigned int 2 { int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id curandState t state1, state2;// States of the generators RNGs1.load(state1,tid);// Loading the state of first generator RNGs2.load(state2,tid);// Loading the state of second generator 9 unsigned int GN = curand(&state2)/NPP group count;//Generate a random number between 0 and group count-1 (Pick a random group) 10 unsigned int index,i; 11 12 for(i=0;i<100;i++) 13 index = (curand(&state1)/NPP group size) + (GN*GS);//Generate a random number between 0 and group size-1 then shift the index (Pick a random data within the selected group) C[tid] = A[index] + B[index];//Vector addition 15 16 17 } 19 int main(int argc, char **argv) 20 { 21 unsigned int data size = 4194304;//Data size float *A host, *B host, *C host ://Host Arrays The number of segments is 16 in a group 23 float *A GPU,*B GPU,*C GPU;//Device Arrays So the memory operations of a warp will perform at 25 for(int counter = 0; counter < data size; counter++)</pre> most 16 transactions 27 A host[counter] = counter+1;//Assigning numbers from 1 to size B host[counter] = counter+2;//Assigning numbers from 2 to size+1 29 31 cudaMemcpy(A GPU,A host,sizeof(float)*data size,cudaMemcpyHostToDevice); cudaMemcpy(B GPU,B host,sizeof(float)*data size,cudaMemcpyHostToDevice); 34 unsigned int NTB = 1024;//Number of threads in a block 35 unsigned int NP data size = (unsigned long int)pow(2,32)/data size;// Number of partitions for 'data size' 37 unsigned int segment size = 128;//Number of bytes of a segment unsigned int group size = 16*(segment size/4);//Number of data in a group unsigned int group count = data size/group size;//Number of groups for 'data size' unsigned int NP group count = (unsigned long int)pow(2,32)/group count;// Number of partitions for 'group count' unsigned int NP group size = (unsigned long int)pow(2,32)/group size;// Number of partitions for 'group size dim3 threadsPerBlock(NTB);//Number of threads in a block dim3 numBlocks(data size/NTB);//Number of blocks in a grid 45 full coalesced access<<<numBlocks,threadsPerBlock>>>(A GPU,B GPU,C GPU);//Launching 'full coalesced access' kernel 46 47 non coalesced access<<<numBlocks,threadsPerBlock>>>(A GPU,B GPU,C GPU,RNGs,NP data size);//Launching 'non coalesced access' kernel cudaDeviceSynchronize();//Waits until the kernel completes its run 49 } 13 EuroCC@Turkey, Parallel Computing on GPUs with CUDA 27 April 2022 ``` ``` global void semi coalesced access(float *A,float *B,float *C,curandInitializer RNGs1,curandInitializer RNGs2,unsigned int NPP group count,unsigned int NPP group size,unsigned int 2 { int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id 4 5 curandState t state1, state2;// States of the generators 6 RNGs1.load(state1,tid);// Loading the state of first generator RNGs2.load(state2,tid);// Loading the state of second generator 8 9 unsigned int GN = curand(&state2)/NPP group count;//Generate a random number between 0 and group count-1 (Pick a random group) 10 unsigned int index,i; 11 12 for(i=0;i<100;i++) 13 14 index = (curand(&statel)/NPP group size) + (GN*GS);//Generate a random number between θ and group size-1 then shift the index (Pick a random data within the selected group) 15 C[tid] = A[index] + B[index];//Vector addition 16 17 } 18 19 int main(int argc, char **argv) 20 { unsigned int data size = 32768;//Data size 21 float *A host, *B host, *C host ;//Host Arrays 41--+ *A CDII *D CDII *C CDI //Barrica Annova ``` ``` Exec. Time of 'full_coalesced_access' kernel = 9.6e-06 Exec. Time of 'semi_coalesced_access' kernel = 0.00078848 Speed Up = 82.1333X Exec. Time of 'non_coalesced_access' kernel = 0.00167981 Speed Up = 174.98X ``` ``` ==16800== Profiling result: ==16800== Metric result: Invocations Metric Description Metric Name Min Max Avg Device "Tesla K40c (0)" Kernel: full coalesced_access(float*, float*, float*) gld transactions Global Load Transactions 2048 2048 2048 Global Load Transactions Per Request 1.000000 gld_transactions_per_request 1.000000 1.000000 Kernel: semi coalesced access(float*, float*, float*, curandInitializer, curandInitializer, unsigned int, unsigned int, unsigned int) Global Load Transactions gld transactions 2870742 2870742 2870742 qld transactions per request Global Load Transactions Per Request 13.413679 13.413679 13.413679 Kernel: non_coalesced_access(float*, float*, float*, curandInitializer, unsigned int) gld transactions Global Load Transactions 6461532 6461532 6461532 gld_transactions_per_request Global Load Transactions Per Request 30.631504 30.631504 30.631504 ``` ``` global void semi coalesced access(float *A,float *B,float *C,curandInitializer RNGs1,curandInitializer RNGs2,unsigned int NPP group count,unsigned int NPP group size,unsigned int GS) 2 { 3 int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id 4 curandState t state1, state2;// States of the generators RNGs1.load(state1,tid);// Loading the state of first generator 6 RNGs2.load(state2,tid);// Loading the state of second generator 8 9 unsigned int GN = curand(&state2)/NPP group count;//Generate a random number between 0 and group count-1 (Pick a random group) 10 unsigned int index,i; 11 12 for(i=0;i<100;i++) 13 index = (curand(&state1)/NPP group size) + (GN*GS);//Generate a random number between 0 and group size-1 then shift the index (Pick a random data within the selected group) C[tid] = A[index] + B[index];//Vector addition 16 17 } 18 19 int main(int argc, char **argv) 20 { 21 unsigned int data size = 4194304; /Data size float *A host, *B host, *C host :// Host Arrays floot *A CDII *P CDII *C CDII //Dovice Arraye ``` ``` Exec. Time of 'full coalesced access' kernel = 0.00029232 Exec. Time of 'semi_coalesced access' kernel = 0.22142 Speed Up = 757.457X Exec. Time of 'non coalesced access' kernel = 0.516598 Speed Up = 1767.24X ``` ``` ==16562== Profiling result: ==16562== Metric result: Invocations Metric Name Metric Description Min Max Avg Device "Tesla K40c (0)" Kernel: full coalesced access(float*, float*, float*) Global Load Transactions gld transactions 262144 262144 262144 gld transactions per request Global Load Transactions Per Request 1.000000 1.000000 1.000000 Kernel: semi_coalesced_access(float*, float*, float*, curandInitializer, curandInitializer, unsigned int, unsigned int, unsigned int) gld transactions Global Load Transactions 367433454 367433454 367433454 gld_transactions_per_request Global Load Transactions Per Request 13.412894 13.412894 13.412894 Kernel: non_coalesced_access(float*, float*, float*, curandInitializer, unsigned int) Global Load Transactions gld_transactions 839547696 839547696 839547696 gld_transactions_per_request Global Load Transactions Per Request 31.093401 31.093401 31.093401 ``` 5 14 15 22 ### Warp Divergence - Some of the structures such as 'If-Else' structure are considered as a single instruction for a warp - A warp completes such instructions when all the threads in the warp complete those instructions - If the threads in a warp execute the different paths of 'If-Else' structure, executing these paths become serial - if(tid %2 == 0)//tid is global thread id ......else First the threads with even thread id in a warp execute 'if' path, and the remaining - threads wait - ► Then the threads with odd thread id in a warp execute the 'else' path, and the remaining threads wait . . . . . . . . . . . . . ### Warp Divergence - It is important that all the threads in a warp execute the same path of the 'if-Else' structure - ▶ This can be ensured by using warp id in the condition of the structure - $\rightarrow$ if( (tid/32) %2 == 0)//tid is the global thread id ``` else ...... ``` - ► The threads in a warp whose warp id is even execute the 'if' path - ▶ The threads in a warp whose warp id is odd execute the 'else' path - So executing the paths do not become serialized ``` unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id for(unsigned int i=0;i<100;i++)</pre> if((tid/32) % 4 == 0) C[tid] = A[tid] + B[tid];//Vector addition else if ((tid/32) % 4 == 1) C[tid] = A[tid] - B[tid];//Vector subtraction Distribute the paths according to warp id else if( (tid/32) % 4 == 2) First warp executes addition, second warp executes C[tid] = A[tid] * B[tid];//Vector multiplication subtraction and so on else if( (tid/32) % 4 == 3) C[tid] = A[tid] / B[tid];//Vector division global void warp divergence(float *A, float *B, float *C)//Four different paths for the warps in 'if-elseif' structure unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id for (unsigned int i=0; i<100; i++) if(tid % 4 == 0) Distribute the paths according to thread id C[tid] = A[tid] + B[tid];//Vector addition else if( tid % 4 == 1) First thread executes addition, second thread C[tid] = A[tid] - B[tid];//Vector subtraction executes subtraction and so on else if ( tid % 4 == 2) C[tid] = A[tid] * B[tid];//Vector multiplication else if( tid % 4 == 3) C[tid] = A[tid] / B[tid];//Vector division • Sufficiently number of paths (4) 35 int main(int argc, char **argv) • Sufficiently number of repetitions of the instruction (100) unsigned int data size = 4194304;//Data size • Memory operations are coalesced, so the divergence float *A host, *B host, *C host ;//Host Arrays dominates the execution time float *A GPU, *B GPU, *C GPU; // Device Arrays We use cudaEventRecord in order to measure the kernel cudaMemcpy(A GPU,A host,sizeof(float)*data size,cudaMemcpyHostToDevice); execution times cudaMemcpy(B GPU,B host,sizeof(float)*data size,cudaMemcpyHostToDevice); unsigned int NTB = 1024;//Number of threads in a block dim3 threadsPerBlock(NTB);//Number of threads in a block dim3 numBlocks(data size/NTB);//Number of blocks in a grid 18 warp no divergence<<<numBlocks,threadsPerBlock>>>(A GPU,B GPU,C GPU);//Launching 'warp no divergence' kernel warp divergence<<<numBlocks,threadsPerBlock>>>(A GPU, B GPU, C GPU);//Launching 'warp divergence' kernel cudaDeviceSynchronize();//Waits until vector add kernel completes its run ``` global void warp no divergence(float \*A,float \*B,float \*C)//No branching for the warps in 'if-elseif' structure 3 4 5 6 15 16 } 18 19 { 20 33 } 36 { 41 47 ``` 3 unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id 5 for(unsigned int i=0;i<100;i++)</pre> 6 if((tid/32) % 4 == 0) C[tid] = A[tid] + B[tid];//Vector addition else if( (tid/32) % 4 == 1) C[tid] = A[tid] - B[tid];//Vector subtraction else if( (tid/32) % 4 == 2) C[tid] = A[tid] * B[tid];//Vector multiplication else if ( (tid/32) % 4 == 3) C[tid] = A[tid] / B[tid];//Vector division 15 16 } 18 global void warp divergence(float *A,float *B,float *C)//Four different paths for the warps in 'if-elseif' structure 20 unsigned int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id for (unsigned int i=0; i<100; i++) 23 24 if(tid % 4 == 0) C[tid] = A[tid] + B[tid];//Vector addition else if( tid % 4 == 1) Exec. Time of 'warp no divergence' kernel = 0.0124316 C[tid] = A[tid] - B[tid];//Vector subtraction Exec. Time of 'warp divergence' kernel = 0.0385476 else if( tid % 4 == 2) C[tid] = A[tid] * B[tid];//Vector multiplication Speed Up = 3.10078X else if ( tid % 4 == 3) 31 C[tid] = A[tid] / B[tid];//Vector division ==23306== Profiling result: 33 } ==23306== Metric result: Invocations Metric Name Metric Description 35 int main(int argc, char **argv) Min Max Avg Device "Tesla K40c (0)" Kernel: warp_divergence(float*, float*) unsigned int data size = 4194304; //Data size float *A_host, *B_host, *C_host :// Host Arrays warp execution efficiency Warp Execution Efficiency 34.80% 34.80% 34.80% Kernel: warp_no_divergence(float*, float*, float*) float *A CDII *P CDII *C CDII. //Doutes Arrays Warp Execution Efficiency warp execution efficiency 100.00% 100.00% 100.00% ``` #### Metric: warp\_execution\_efficiency: Ratio of the average active threads per warp to the maximum number of threads per warp supported on a multiprocessor expressed as percentage global void warp no divergence(float \*A,float \*B,float \*C)//No branching for the warps in 'if-elseif' structure #### Occupancy - is the ratio of active warps to the maximum number of resident warps supported on a multiprocessor - is related with resource limitations of the SMX. These limitations are: - ► Maximum number of threads per multiprocessor (2048) - Maximum number of threads per block (1024) - Maximum number of blocks per multiprocessor (16) - Shared memory and registers - --ptxas-options=-v gives us the shared memory and register usage - ► The main target is to find the optimum number of threads in a block in order to achieve maximum occupancy ### Occupancy - Set the block size as 64: - ► At most 16x64 (1024) threads can be active in a SMX - ▶ %50 theoretical occupancy - Set the block size as 128: - ► At most 16x128 (2048) threads can be active in a SMX - ▶ %100 theoretical occupancy - Set the block size as 1024: - ► At most 2x1024 (2048) threads can be active in a SMX - %100 theoretical occupancy ``` global void vector add(float *A,float *B,float *C) 2 { int tid = blockDim.x*blockIdx.x+threadIdx.x;//Global thread id for(int i=0;i<1000000;i++) 6 if((tid/32) % 4 == 0) C[tid] = A[tid] + B[tid];//Vector addition else if ( (tid/32) % 4 == 1) C[tid] = A[tid] - B[tid];//Vector subtraction else if( (tid/32) % 4 == 2) C[tid] = A[tid] * B[tid];//Vector multiplication else if ( (tid/32) % 4 == 3) C[tid] = A[tid] / B[tid];//Vector division 14 15 } 17 int main(int argc, char **argv) 18 { int data size;//Data size float *A host, *B host, *C host; // Host Arrays float *A GPU, *B GPU, *C GPU; // Device Arrays int NTB;//Number of threads per block if(data size <= 1024)//Scenario 1 - Set NTB to data size until 2^:</pre> NTB = data size; else NTB = 1024; dim3 threadsPerBlockS1(NTB);//Number of threads in a block dim3 numBlocksS1(data size/NTB);//Number of blocks in a grid vector add<<<numBlocksS1,threadsPerBlockS1>>>(A GPU,B GPU,C GPU); if(data size <= 512)//Scenario 2 - Increase NTB to its double</pre> NTB = 32; else if(data size == 1024) NTB = 64: else if(data size == 2048) NTB = 128; else if(data size == 4096) NTB = 256; else if(data size == 8192) NTB = 512: else NTB = 1024; 45 dim3 threadsPerBlockS2(NTB);//Number of threads in a block 46 47 dim3 numBlocksS2(data size/NTB);//Number of blocks in a grid 49 vector add<<<numBlocksS2,threadsPerBlockS2>>>(A GPU,B GPU,C GPU); ``` - We set the number of iterations as 1000000 so that 'if-elseif' structure dominates the execution time - No warp divergence is occurred - We use cudaEventRecord in order to measure the kernel execution times In scenario 1, we set the number of threads as data\_size until data\_size is 2048 In scenario 2, we set the number of threads as 32 until data\_size is 1024. Then we double the number of threads until data\_size is 16384 #### Occupancy | data_size | Scenario 1 | | | Scenario 2 | | | | |-----------|-----------------|----------------|-----------|--------------|----------------|--------------|--| | | # of<br>threads | # of<br>blocks | T. Occup. | # of threads | # of<br>blocks | T.<br>Occup. | | | 32 | 32 | 1 | %25 | 32 | 1 | %25 | | | 64 | 64 | 1 | %50 | 32 | 2 | %25 | | | 128 | 128 | 1 | %100 | 32 | 4 | %25 | | | 256 | 256 | 1 | %100 | 32 | 8 | %25 | | | 512 | 512 | 1 | %100 | 32 | 16 | %25 | | | 1024 | 1024 | 1 | %100 | 64 | 16 | %50 | | | 2048 | 1024 | 2 | %100 | 128 | 16 | %100 | | | 4096 | 1024 | 4 | %100 | 256 | 16 | %100 | | | 8192 | 1024 | 8 | %100 | 512 | 16 | %100 | | | 16384 | 1024 | 16 | %100 | 1024 | 16 | %100 | | | 32768 | 1024 | 32 | %100 | 1024 | 32 | %100 | | | 65536 | 1024 | 64 | %100 | 1024 | 64 | %100 | | - ▶ Increasing theoretical occupancy does not always mean better time performance - Utilization of streaming multiprocessors efficiently is also an important issue for the better time performance - In the second scenario, we try to distribute the blocks to the SMs evenly ### **SM Efficiency** - sm\_efficiency metric: The percentage of time at least one warp is active on a multiprocessor averaged over all multiprocessors on the GPU - ► The ratios of the running time of each SM to the total running time of the GPU is calculated. The average of these ratios is the result of the metric - achieved\_occupancy metric: the ratio of the average active warps per active cycle to the maximum number of warps supported on a multiprocessor - achieved\_occupancy can not exceed the theoretical occupancy | Data | <b>S1</b> | <b>S2</b> | <b>S1</b> | <b>S2</b> | <b>S1</b> | <b>S2</b> | <b>S1</b> | <b>S2</b> | <b>S1</b> | <b>S2</b> | <b>S1</b> | <b>S2</b> | |-------|-------------------|-------------------|-----------------|-----------------|---------------------|---------------------|-------------------|-------------------|-------------------|-------------------|-------------------|-------------------| | Size | # of th | nreads | # of b | locks | SM Effi | iciency | T. Oc | ccup. | Achieve | ed Occ. | Exec. | Time | | 32 | <mark>32</mark> | <mark>32</mark> | 1 | 1 | <mark>6.54%</mark> | <mark>6.55%</mark> | <mark>25%</mark> | <mark>25%</mark> | <mark>1.5%</mark> | <mark>1.5%</mark> | <mark>0.35</mark> | <mark>0.35</mark> | | 64 | 64 | 32 | 1 | 2 | 6.54% | 12.46% | <b>50</b> % | 25% | 2.9% | 1.5% | 0.41 | 0.41 | | 128 | 128 | 32 | 1 | 4 | 6.54% | 18.83% | 100% | 25% | 4.3% | 1.5% | 0.69 | 0.69 | | 256 | 256 | 32 | 1 | 8 | 6.54% | 37.17% | 100% | 25% | 8.8% | 1.5% | 0.70 | <mark>0.69</mark> | | 512 | 512 | 32 | 1 | 16 | 6.54% | 68.48% | 100% | 25% | <b>17%</b> | 1.6% | 0.71 | <mark>0.69</mark> | | 1024 | 1024 | 64 | 1 | 16 | 6.54% | 78.32% | <mark>100%</mark> | 50% | <mark>35%</mark> | 2.8% | 0.74 | <mark>0.70</mark> | | 2048 | 1024 | 128 | 2 | 16 | 13.15% | 96.32% | 100% | 100% | <mark>35%</mark> | 4.7% | 0.75 | <mark>0.70</mark> | | 4096 | 1024 | 256 | 4 | 16 | 26.26% | 95.91% | 100% | 100% | <mark>35%</mark> | 9.4% | 0.75 | <mark>0.72</mark> | | 8192 | 1024 | 512 | 8 | 16 | 52.41% | 95.10% | 100% | 100% | <mark>35%</mark> | 18% | 0.75 | <mark>0.74</mark> | | 16384 | <mark>1024</mark> | <mark>1024</mark> | <mark>16</mark> | <mark>16</mark> | 83.29% | <b>83.28</b> % | <mark>100%</mark> | <mark>100%</mark> | <mark>39%</mark> | <mark>39%</mark> | <mark>0.91</mark> | <mark>0.91</mark> | | 32768 | <mark>1024</mark> | <mark>1024</mark> | <mark>32</mark> | <mark>32</mark> | <mark>62.07%</mark> | <mark>62.11%</mark> | <mark>100%</mark> | <mark>100%</mark> | <mark>72%</mark> | <mark>72%</mark> | <mark>1.6</mark> | <mark>1.6</mark> | | 65536 | <mark>1024</mark> | <mark>1024</mark> | <mark>64</mark> | <mark>64</mark> | <mark>72.80%</mark> | <mark>72.80%</mark> | <mark>100%</mark> | <mark>100%</mark> | <mark>77%</mark> | <mark>75%</mark> | <mark>2.49</mark> | <mark>2.49</mark> | - ▶ Values with green background mean the scenario is better than the other scenario for the corresponding output - Values with yellow background mean both scenarios have the same values of parameters. Hence the values of the outputs are almost same 25 - ▶ Having better theoretical and achieved occupancy does not always mean better time performance - In this example, SM efficiency is more effective on the execution time of the kernel - Even though S1 has better occupancy, the execution times of S2 are better than those in S1 in some cases ### Causes of Low Achieved Occupancy - Unbalanced workload within blocks - ▶ the warps in a block have unbalanced workload - 2. Unbalanced workload across blocks - the blocks in a grid have unbalanced workload - 3. Too few blocks launched - running few blocks in an SM than the maximum active blocks per SM - 4. Partial last wave - maximum number of warps that can be active at once in an SM - https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/report/cudaexperiments/kernellevel/achievedoccupancy.htm # Optimizing Parallel Reduction in CUDA Mark Harris NVIDIA Developer Technology https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf #### Parallel Reduction Tree based approach is used for each thread block - We need too many blocks when: - the array size is very large - we want to keep all SMs on the GPU busy - Each block does reduction over each portion of the array and produces a single output - ► How do we combine the output of each block? ### **Global Synchronization** - Once the partial outputs of the blocks are produced, a synchronization mechanism between the blocks is needed in order to combine them. - ► The blocks must wait in a synchronization point until all the blocks complete producing their outputs. Then recursive processes should follow to obtain overall output. - In CUDA, the synchronization between the threads in the same block is possible. - ► However, the threads in different blocks can not be synchronized each other. - ▶ Because the synchronization of the blocks is expensive for the GPUs whose processor counts is large - ▶ In fact, CUDA forces the programmers to create fewer blocks which may reduce the overall efficiency - Using multiple kernels is a good solution. There is an implicit barrier between the kernel launches. The next kernel can not be executed before the current kernel finishes its execution. - By the way, kernel launch has low overhead ### Multiple Kernels and The Goal - ▶ The code of both kernels is same. So we can launch them recursively. - ▶ Since the reduction has low arithmetic cost, we should try to achieve high bandwidth. - NVIDIA G80 is used in this experiment. - ▶ 86GB/s memory bandwidth #### Reduction 1: Interleaved Addressing ``` _global__ void reduce0(int *g_idata, int *g_odata) { extern shared int sdata[]; // each thread loads one element from global to shared mem unsigned int tid = threadldx.x; ______ Local thread id unsigned int i = blockldx.x*blockDim.x + threadIdx.x; ————— Global thread id sdata[tid] = g_idata[i]; __syncthreads(); ———————— Threads in the same block synchronize here // do reduction in shared mem for(unsigned int s=1; s < blockDim.x; s *= 2) { if (tid % (2*s) == 0) { Threads whose local id is 0 or multiple of 2*s sdata[tid] += sdata[tid + s]; __syncthreads(); // write result for this block to global mem if (tid == 0) g_odata[blockldx.x] = sdata[0]; ``` ### Interleaved Addressing 1 #### Reduction 1: Interleaved Addressing ``` _global__ void reduce1(int *g_idata, int *g_odata) { extern __shared__ int sdata[]; // each thread loads one element from global to shared mem unsigned int tid = threadldx.x; unsigned int i = blockldx.x*blockDim.x + threadldx.x; sdata[tid] = g_idata[i]; __syncthreads(); // do reduction in shared mem for (unsigned int s=1; s < blockDim.x; s *= 2) { if (tid % (2*s) == 0) { Problem: highly divergent sdata[tid] += sdata[tid + s]; warps are very inefficient, and % operator is very slow __syncthreads(); // write result for this block to global mem if (tid == 0) g_odata[blockldx.x] = sdata[0]; Ref: Mark Harris - https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf ``` #### Performance for 4M element reduction | Ti | me (2 <sup>22</sup> ints) | Bandwidth | | | |-----------------------------------------------------------|---------------------------|------------|--|--| | Kernel 1: interleaved addressing with divergent branching | 8.054 ms | 2.083 GB/s | | | The block size is 128 threads for all kernel launches #### Reduction 2: Interleaved Addressing #### Just replace divergent branch in inner loop: ``` for (unsigned int s=1; s < blockDim.x; s *= 2) { if (tid % (2*s) == 0) { sdata[tid] += sdata[tid + s]; } __syncthreads(); }</pre> ``` #### With strided index and non-divergent branch: ``` for (unsigned int s=1; s < blockDim.x; s *= 2) { int index = 2 * s * tid; if (index < blockDim.x) { sdata[index] += sdata[index + s]; } __syncthreads(); }</pre> ``` ### Interleaved Addressing 2 | | Time (2 <sup>22</sup> ints) | Bandwidth | Step<br>Speedup | Cumulative<br>Speedup | |-----------------------------------------------------------------|-----------------------------|------------|-----------------|-----------------------| | Kernel 1:<br>interleaved addressing<br>with divergent branching | 8.054 ms | 2.083 GB/s | | | | Kernel 2: interleaved addressing with bank conflicts | 3.456 ms | 4.854 GB/s | 2.33x | 2.33x | ### Sequential Addressing # Reduction 3: Sequential Addressing #### Just replace strided indexing in inner loop: ``` for (unsigned int s=1; s < blockDim.x; s *= 2) { int index = 2 * s * tid; if (index < blockDim.x) { sdata[index] += sdata[index + s]; } __syncthreads(); }</pre> ``` #### With reversed loop and threadID-based indexing: ``` for (unsigned int s=blockDim.x/2; s>0; s>>=1) { if (tid < s) { sdata[tid] += sdata[tid + s]; } __syncthreads(); }</pre> ``` | | Time (2 <sup>22</sup> ints) | Bandwidth | Step<br>Speedup | Cumulative<br>Speedup | |-----------------------------------------------------------|-----------------------------|------------|-----------------|-----------------------| | Kernel 1: interleaved addressing with divergent branching | 8.054 ms | 2.083 GB/s | | | | Kernel 2: interleaved addressing with bank conflicts | 3.456 ms | 4.854 GB/s | 2.33x | 2.33x | | Kernel 3: sequential addressing | 1.722 ms | 9.741 GB/s | 2.01x | 4.68x | #### Idle Threads ``` for (unsigned int s=blockDim.x/2; s>0; s>>=1) { if (tid < s) { sdata[tid] += sdata[tid + s]; } __syncthreads(); }</pre> ``` ▶ Half of the threads are idle in the first iteration of the loop. Wasteful! # Reduction 4: First Add During Load #### Halve the number of blocks, and replace single load: ``` // each thread loads one element from global to shared mem unsigned int tid = threadldx.x; unsigned int i = blockldx.x*blockDim.x + threadldx.x; sdata[tid] = g_idata[i]; __syncthreads(); ``` #### With two loads and first add of the reduction: ``` // perform first level of reduction, // reading from global memory, writing to shared memory unsigned int tid = threadldx.x; unsigned int i = blockldx.x*(blockDim.x*2) + threadldx.x; sdata[tid] = g_idata[i] + g_idata[i+blockDim.x]; __syncthreads(); ``` | | Time (2 <sup>22</sup> ints) | Bandwidth | Step<br>Speedup | Cumulative<br>Speedup | |-----------------------------------------------------------|-----------------------------|-------------|-----------------|-----------------------| | Kernel 1: interleaved addressing with divergent branching | 8.054 ms | 2.083 GB/s | | | | Kernel 2: interleaved addressing with bank conflicts | 3.456 ms | 4.854 GB/s | 2.33x | 2.33x | | Kernel 3: sequential addressing | 1.722 ms | 9.741 GB/s | 2.01x | 4.68x | | Kernel 4: first add during global load | 0.965 ms | 17.377 GB/s | 1.78x | 8.34x | # Unrolling the Last Warp - ▶ 17 GB/s is very low - Loop overhead occurs! - The number of the active threads reduces at each iteration of the loop - When s < 32, only one warp is active!</p> - Since the threads in a warp execute the same instruction at a time, we do not need to synchronize them and do not need \_\_syncthreads() function - ▶ No need to evaluate last 6 iterations of the loop for the remaining warps - So one can unroll the last 6 iterations of the loop # Reduction 5: Unroll the Last Warp ``` _device__ void warpReduce(volatile int* sdata, int tid) { sdata[tid] += sdata[tid + 32]; ^ sdata[tid] += sdata[tid + 16]; IMPORTANT: sdata[tid] += sdata[tid + 8]; For this to be correct, sdata[tid] += sdata[tid + 4]; we must use the sdata[tid] += sdata[tid + 2]; "volatile" keyword! sdata[tid] += sdata[tid + 1]; // later... for (unsigned int s=blockDim.x/2; s>32; s>>=1) { if (tid < s) sdata[tid] += sdata[tid + s]; syncthreads(); ``` Ref: Mark Harris - https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf if (tid < 32) warpReduce(sdata, tid);</pre> Disabling optimizations (such as caching) | | Time (2 <sup>22</sup> ints) | Bandwidth | Step<br>Speedup | Cumulative<br>Speedup | |-----------------------------------------------------------|-----------------------------|-------------|-----------------|-----------------------| | Kernel 1: interleaved addressing with divergent branching | 8.054 ms | 2.083 GB/s | | | | Kernel 2: interleaved addressing with bank conflicts | 3.456 ms | 4.854 GB/s | 2.33x | 2.33x | | Kernel 3: sequential addressing | 1.722 ms | 9.741 GB/s | 2.01x | 4.68x | | Kernel 4: first add during global load | 0.965 ms | 17.377 GB/s | 1.78x | 8.34x | | Kernel 5:<br>unroll last warp | 0.536 ms | 31.289 GB/s | 1.8x | 15.01x | # Complete Unrolling - Can we unroll all of the iterations of the loop? - ▶ Since the possible values of the block size are fixed and known at compile time and there is an upper limit on it, one can completely unroll the iterations of the loop. - ▶ Using C++ templates enables us to do it template <unsigned int blockSize> \_\_global\_\_ void reduce5(int \*g\_idata, int \*g\_odata) Function template parameter ## Completely Unrolled ``` Template <unsigned int blockSize> __device__ void warpReduce(volatile int* sdata, int tid) { if (blockSize >= 64) sdata[tid] += sdata[tid + 32]; if (blockSize >= 32) sdata[tid] += sdata[tid + 16]; if (blockSize >= 16) sdata[tid] += sdata[tid + 8]; if (blockSize >= 8) sdata[tid] += sdata[tid + 4]; if (blockSize >= 4) sdata[tid] += sdata[tid + 2]; if (blockSize >= 2) sdata[tid] += sdata[tid + 1]; } ``` ``` if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); } if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); } if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } __syncthreads(); } if (tid < 32) warpReduce<blockSize>(sdata, tid); ``` - The maximum block size is 1024 in newer GPUs! - Reds are evaluated at compile time # **Invoking Template Kernels** ``` switch (threads) case 512: reduce5<512><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata); break; case 256: reduce5<256><< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata); break; case 128: reduce5<128><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata); break; case 64: reduce5< 64><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata); break; case 32: reduce5< 32><< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata); break; case 16: reduce5< 16><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata); break; case 8: reduce5< 8><< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata); break; case 4: reduce5< 4><< dimGrid, dimBlock, smemSize >>>(d idata, d odata); break; case 2: reduce5< 2><< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata); break; case 1: reduce5< 1><< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata); break; ``` ► 10 + 1 (1024) = 11 possible block sizes | | Time (2 <sup>22</sup> ints) | Bandwidth | Step<br>Speedup | Cumulative<br>Speedup | |------------------------------------------------------------|-----------------------------|-------------|-----------------|-----------------------| | Kernel 1: interleaved addressing with divergent branching | 8.054 ms | 2.083 GB/s | | | | Kernel 2:<br>interleaved addressing<br>with bank conflicts | 3.456 ms | 4.854 GB/s | 2.33x | 2.33x | | Kernel 3: sequential addressing | 1.722 ms | 9.741 GB/s | 2.01x | 4.68x | | Kernel 4: first add during global load | 0.965 ms | 17.377 GB/s | 1.78x | 8.34x | | Kernel 5:<br>unroll last warp | 0.536 ms | 31.289 GB/s | 1.8x | 15.01x | | Kernel 6: completely unrolled | 0.381 ms | 43.996 GB/s | 1.41x | 21.16x | # Cost and Algorithm Cascading - Step Complexity: O(log N) - Work Complexity: O(N) - Time Complexity: O(log N) - Processor Time Complexity: O(Nlog N) - Algorithm Cascading: Combining sequential and parallel reduction - Each thread performs loading and summing multiple elements sequentially and store the result to the shared memory - Tree based reduction performs through shared memory in parallel ### Reduction 7: Multiple Adds / Thread #### Replace load and add of two elements: ``` unsigned int tid = threadldx.x; unsigned int i = blockldx.x*(blockDim.x*2) + threadldx.x; sdata[tid] = g_idata[i] + g_idata[i+blockDim.x]; __syncthreads(); ``` #### With a while loop to add as many as necessary: ``` unsigned int tid = threadldx.x; Decreasing the number of blocks means increasing unsigned int i = blockldx.x*(blockSize*2) + threadldx.x; the number of iterations unsigned int gridSize = blockSize*2*gridDim.x; of 'while' loop sdata[tid] = 0: while (i < n) { A balance between the sdata[tid] += g_idata[i] + g_idata[i+blockSize]; level of the tree and the i += gridSize; iteration of the loop should be maintained _syncthreads(); Maintain coalescing ``` | | Time (2 <sup>22</sup> ints) | Bandwidth | Step<br>Speedup | Cumulative<br>Speedup | |-----------------------------------------------------------------|-----------------------------|-------------|-----------------|-----------------------| | Kernel 1:<br>interleaved addressing<br>with divergent branching | 8.054 ms | 2.083 GB/s | | | | Kernel 2:<br>interleaved addressing<br>with bank conflicts | 3.456 ms | 4.854 GB/s | 2.33x | 2.33x | | Kernel 3: sequential addressing | 1.722 ms | 9.741 GB/s | 2.01x | 4.68x | | Kernel 4: first add during global load | 0.965 ms | 17.377 GB/s | 1.78x | 8.34x | | Kernel 5:<br>unroll last warp | 0.536 ms | 31.289 GB/s | 1.8x | 15.01x | | Kernel 6: completely unrolled | 0.381 ms | 43.996 GB/s | 1.41x | 21.16x | | Kernel 7:<br>multiple elements per thread | 0.268 ms | 62.671 GB/s | 1.42x | 30.04x | ## Final Optimized Kernel ``` template <unsigned int blockSize> device void warpReduce(volatile int *sdata, unsigned int tid) { if (blockSize >= 64) sdata[tid] += sdata[tid + 32]; if (blockSize >= 32) sdata[tid] += sdata[tid + 16]; if (blockSize >= 16) sdata[tid] += sdata[tid + 8]; if (blockSize >= 8) sdata[tid] += sdata[tid + 4]; if (blockSize >= 4) sdata[tid] += sdata[tid + 2]; if (blockSize >= 2) sdata[tid] += sdata[tid + 1]; template <unsigned int blockSize> __global__ void reduce6(int *g_idata, int *g_odata, unsigned int n) { extern __shared__ int sdata[]; unsigned int tid = threadldx.x; unsigned int i = blockldx.x*(blockSize*2) + tid; unsigned int gridSize = blockSize*2*gridDim.x; sdata[tid] = 0; while (i < n) { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; } __syncthreads(); if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); } if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); } if (blockSize >= 128) { if (tid < 64) { sdata[tid] += sdata[tid + 64]; } syncthreads(); } if (tid < 32) warpReduce(sdata, tid);</pre> if (tid == 0) g_odata[blockldx.x] = sdata[0]; ``` # Performance Comparison #### References - https://www.nvidia.com/content/dam/en-zz/Solutions/Data-Center/tesla-product-literature/NVIDIA-Kepler-GK110-GK210-Architecture-Whitepaper.pdf - https://docs.nvidia.com/cuda/pdf/CURAND\_Library.pdf - https://docs.nvidia.com/cuda/cuda-runtime-api/group\_\_CUDART\_\_EVENT.html - https://docs.nvidia.com/cuda/profiler-users-guide - Dülger, Ö., Oğuztüzün, H. & Demirekler, M. Memory Coalescing Implementation of Metropolis Resampling on Graphics Processing Unit. J Sign Process Syst 90, 433-447 (2018) <a href="https://rdcu.be/cLz8N">https://rdcu.be/cLz8N</a> - https://docs.nvidia.com/cuda/cuda-occupancy-calculator/index.html - https://docs.nvidia.com/gameworks/content/developertools/desktop/analysis/ report/cudaexperiments/kernellevel/achievedoccupancy.htm - https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf