CUDA

GP-what?

prezentacia

Hints

Pre-requisites

Jadro

Code

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

// Ask the CUDA runtime how many devices it can see and report both the
// API status code and the count.
void probe(){
    int devices;
    int rv = cudaGetDeviceCount(&devices);
    printf("[%d] Devices: %d\n",rv,devices);
}

int main(){
    probe();
}

Kompilacia

nvcc -arch=sm_30 -std=c++11 -x cu main.cpp -o main
alebo
nvcc -D_FORCE_INLINES -m64 -O3 -arch=sm_30 -std=c++11 --compiler-options -fno-strict-aliasing -use_fast_math --ptxas-options=-v -x cu main.cpp -o main

Exec

./main [38] Devices: 0
curun req 2 ./main [0] Devices: 2

Zariadenia

V systeme moze byt naraz viac CUDA zariadeni.

Code

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>

// Enumerate all CUDA devices and print the basic properties of each:
// name, PCI location, compute capability, clock and global memory size.
void probe(){
    int devices;
    cudaDeviceProp prop;
    int rv = cudaGetDeviceCount(&devices);
    printf("[%d] Devices: %d\n",rv,devices);
    for(int d=0; d<devices; d++){
        cudaGetDeviceProperties(&prop, d);
        printf("[%i] Name: %s\n",d,prop.name);
        printf(" PCI : %d:%d:%d\n",prop.pciBusID,prop.pciDomainID,prop.pciDeviceID);
        printf(" Ver : %d.%d\n",prop.major,prop.minor);
        // clockRate is reported by the runtime in kHz, hence /1000 for MHz
        printf(" Freq: %.2f MHz\n",prop.clockRate/1000.0);
        printf(" MEM : %lu MB\n",prop.totalGlobalMem/(1024*1024));
    }
}

int main(){
    probe();
}

./main [0] Devices: 2 [0] Name: GeForce GTX TITAN Black PCI : 5:0:0 Ver : 3.5 Freq: 980.00 MHz MEM : 6079 MB [1] Name: Quadro K4000 PCI : 6:0:0 Ver : 3.0 Freq: 810.50 MHz MEM : 3014 MB

Prvy program - Vektor * skalar

Code

#include <cuda.h>
#include <cuda_runtime.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <cmath>
#include <algorithm>
#include <chrono>

typedef unsigned int uint;

// Index of the CUDA device used for all GPU work.
#define DEVICE 0

// Compile-time precision switch: build with -DDOUBLE_PRECISION to run
// the whole pipeline in double instead of float.
#ifdef DOUBLE_PRECISION
#define FLOATING double
#define ABS fabs
#define SIN sin
#define COS cos
#define SICO sincos
#else
#define FLOATING float
#define ABS fabsf
#define SIN sinf
#define COS cosf
#define SICO sincosf
#endif

#define SOF sizeof(FLOATING)
#define H2D cudaMemcpyHostToDevice
#define D2D cudaMemcpyDeviceToDevice
#define D2H cudaMemcpyDeviceToHost

// Synchronize the device, then bail out of the enclosing void function
// with a diagnostic if any preceding CUDA call or kernel failed.
#define CUDAEXITONERROR(s) {cudaDeviceSynchronize();cudaError err;if ((err=cudaGetLastError()) != cudaSuccess) {printf("CUDA error:%s\n%s", s,cudaGetErrorString(err));return;}}

using namespace std;

// Sum of absolute element-wise differences between a and b.
// The volatile round-trip detects NaN (x != x) and Inf (x - x != 0)
// and replaces such elements with 0 so they cannot poison the total.
double diff(FLOATING *a,FLOATING *b,uint size){
    FLOATING x,iv;
    double rv=0;
    volatile FLOATING tmp;
    for(uint i=0;i<size;i++){
        x = a[i];
        tmp=x;
        if ((x!=tmp)|| (tmp==x && (tmp-x)!=0)) tmp=0;
        iv=tmp;
        x = b[i];
        tmp=x;
        if ((x!=tmp)|| (tmp==x && (tmp-x)!=0)) tmp=0;
        iv-=tmp;
        rv += ABS(iv);
    }
    return rv;
}

// Millisecond stopwatch: tick(label) prints the time elapsed since the
// previous tick() and restarts the interval; tick() with no label just
// restarts it.
class Ticker{
private:
    FLOATING elapsed_ms;
    chrono::steady_clock::time_point tp_prev;
    chrono::steady_clock::time_point tp_now;
public:
    Ticker():tp_prev(chrono::steady_clock::now()){};
    void tick(const char* txt=nullptr){
        tp_now = chrono::steady_clock::now();
        if (txt != nullptr){
            elapsed_ms = chrono::duration_cast<chrono::microseconds>(tp_now - tp_prev).count()/1000.0;
            printf("[I][%8.2fms]:%s\n",elapsed_ms,txt);
        }
        tp_prev = tp_now;
    }
};

// Grid-stride kernel: tgt[i] = src[i] * factor for every i < size.
// The stride loop makes the kernel correct for any launch configuration.
__global__ void kernel_mul(FLOATING *src,FLOATING *tgt,uint size,FLOATING factor){
    const uint numThreads = blockDim.x * gridDim.x;
    for (uint i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += numThreads){
        tgt[i] = src[i]*factor;
    }
}

// Host wrapper: copy h_src to the GPU, scale it by fac, copy the result
// back into h_tgt. Prints the pure kernel execution time as "Exec".
void d_mul(FLOATING *h_src,FLOATING fac,FLOATING *h_tgt,uint size){
    cudaSetDevice(DEVICE);
    FLOATING *d_src,*d_tgt;
    Ticker tick;
    cudaMalloc((void**)&d_src, SOF*size);
    cudaMalloc((void**)&d_tgt, SOF*size);
    cudaMemcpy(d_src,h_src,size*SOF,H2D);
    tick.tick();
    kernel_mul<<<512,512>>>(d_src,d_tgt,size,fac);
    cudaDeviceSynchronize();
    tick.tick("Exec");
    cudaMemcpy(h_tgt,d_tgt,size*SOF,D2H);
    cudaFree(d_src);
    cudaFree(d_tgt);
    // Surface any launch/execution error (was missing here; the matrix
    // version of this wrapper already checks).
    CUDAEXITONERROR("Done");
}

// Driver: generate random data, compute the reference product on the
// CPU, repeat it on the GPU and report the accumulated difference.
void mul(){
    Ticker tick;
    uint size=64*1024*1024;
    FLOATING *a,*dev,*host,fac,rv;
    fac = 13.37;
    a = new FLOATING[size];
    dev = new FLOATING[size];
    host = new FLOATING[size];
    srand(time(nullptr));
    tick.tick();
    for(uint i=0;i<size;i++) a[i]=(rand()%87355)/87355.0;
    tick.tick("Gen");
    for(uint i=0;i<size;i++) host[i]=a[i]*fac;
    tick.tick("Host");
    d_mul(a,fac,dev,size);
    tick.tick("CUDA");
    rv = diff(host,dev,size);
    tick.tick("Check");
    printf("Diff : %f | %e\n",rv,rv/size);
    // FIX: arrays allocated with new[] must be released with delete[]
    // (plain delete on a new[] pointer is undefined behavior).
    delete[] a;
    delete[] host;
    delete[] dev;
}

int main(){ mul(); }

./main [I][ 1327.04ms]:Gen [I][ 425.36ms]:Host [I][ 2.84ms]:Exec [I][ 576.39ms]:CUDA [I][ 1513.19ms]:Check Diff : 0.000000 | 0.000000e+00
-DDOUBLE_PRECISION
./main [I][ 1302.76ms]:Gen [I][ 502.15ms]:Host [I][ 5.05ms]:Exec [I][ 841.00ms]:CUDA [I][ 1267.17ms]:Check Diff : 0.000000 | 0.000000e+00

Architektura

Processing flow

Processing grid

Execution

Precision (IEEE 754)
GeForce 7xx
FP64:FP32
float: 24E82 (16ME127)
double: 53E112
CUDA math
Porovnanie schopnosti roznych generacii GPU

Code

// Emulated extended-precision number stored as a pair of floats:
// .y = head (leading, high-order part), .x = tail (low-order correction).
typedef float2 dblfloat; // .y = head, .x = tail

// Product of two double-float values. The FMA chain is order-sensitive:
// fmaf(x.y, y.y, -t.y) recovers the rounding error of the head product
// in a single rounding, and the remaining fmaf calls fold in the
// tail*tail and cross terms.
__host__ __device__ __forceinline__ dblfloat mul_dblfloat (dblfloat x, dblfloat y)
{
    dblfloat t, z;
    float sum;
    t.y = x.y * y.y;                 // head product
    t.x = fmaf (x.y, y.y, -t.y);     // exact error of the head product (FMA)
    t.x = fmaf (x.x, y.x, t.x);      // tail * tail
    t.x = fmaf (x.y, y.x, t.x);      // cross term head*tail
    t.x = fmaf (x.x, y.y, t.x);      // cross term tail*head
    /* normalize result */
    sum = t.y + t.x;                 // fold tail back into a new head
    z.x = (t.y - sum) + t.x;         // leftover becomes the new tail
    z.y = sum;
    return z;
}

Matrix Multiplication

Code

// Naive matrix multiply in global memory: one thread computes one output
// element tgt[y][x] = sum_k a[y][k] * b[k][x]. Matrices are size x size,
// stored row-major.
__global__ void kernel_mmul(FLOATING *a,FLOATING *b,FLOATING *tgt,uint size){
    uint x=blockDim.x*blockIdx.x + threadIdx.x;
    uint y=blockDim.y*blockIdx.y + threadIdx.y;
    uint i;
    FLOATING val;
    if (x<size && y<size){
        val=0;
        for(i=0;i<size;i++) val += a[i+y*size] * b[x+i*size];
        tgt[x+y*size]=val;
    }
}

// Host wrapper: upload both matrices, run kernel_mmul on a 2D grid of
// 32x16 blocks and download the product into h_tgt.
void d_mmul(FLOATING *h_a,FLOATING *h_b,FLOATING *h_tgt,uint size){
    cudaSetDevice(DEVICE);
    FLOATING *d_a,*d_b,*d_tgt;
    Ticker tick;
    dim3 gdim,bdim;
    int bx=32,by=16;
    cudaMalloc((void**)&d_a, SOF*size*size);
    cudaMalloc((void**)&d_b, SOF*size*size);
    cudaMalloc((void**)&d_tgt, SOF*size*size);
    cudaMemcpy(d_a,h_a,size*size*SOF,H2D);
    cudaMemcpy(d_b,h_b,size*size*SOF,H2D);
    bdim = dim3(bx,by,1);
    // ceil-div so the grid covers the matrix even when size is not a
    // multiple of the block dimensions
    gdim = dim3((size+bx-1)/bx,(size+by-1)/by,1);
    tick.tick();
    kernel_mmul<<<gdim,bdim>>>(d_a,d_b,d_tgt,size);
    cudaDeviceSynchronize();
    tick.tick("Exec");
    cudaMemcpy(h_tgt,d_tgt,size*size*SOF,D2H);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_tgt);
    CUDAEXITONERROR("Done");
}

// Driver: random matrices, sequential CPU reference, GPU version, then
// the summed absolute difference between the two results.
void mmul(){
    Ticker tick;
    uint size=1024;
    FLOATING *a,*b,*dev,*host,rv,tmp;
    a = new FLOATING[size*size];
    b = new FLOATING[size*size];
    dev = new FLOATING[size*size];
    host = new FLOATING[size*size];
    srand(time(nullptr));
    tick.tick();
    for(uint i=0;i<size*size;i++){
        a[i]=(rand()%87355)/87355.0;
        b[i]=(rand()%87355)/87355.0;
    }
    tick.tick("Gen");
    for(uint x=0;x<size;x++)
        for(uint y=0;y<size;y++){
            tmp=0;
            for(uint k=0;k<size;k++) tmp+=a[k+y*size]*b[x+k*size];
            host[x+y*size]=tmp;
        }
    tick.tick("Host");
    d_mmul(a,b,dev,size);
    tick.tick("CUDA");
    rv = diff(host,dev,size*size);
    tick.tick("Check");
    printf("Diff : %f | %e\n",rv,rv/(size*size));
    // FIX: arrays allocated with new[] must be released with delete[]
    // (plain delete on a new[] pointer is undefined behavior).
    delete[] a;
    delete[] b;
    delete[] host;
    delete[] dev;
}

int main(){ mmul(); }

./main [I][ 58.49ms]:Gen [I][ 9401.92ms]:Host [I][ 13.63ms]:Exec [I][ 341.94ms]:CUDA [I][ 25.20ms]:Check Diff : 7.629135 | 7.275710e-06
-DDOUBLE_PRECISION
./main [I][ 59.34ms]:Gen [I][ 9279.25ms]:Host [I][ 24.06ms]:Exec [I][ 415.43ms]:CUDA [I][ 23.22ms]:Check Diff : 0.000000 | 1.359871e-14

Zdielana pamat

Pri nasobeni matic nasobime riadok stlpcom (a[i][k] * b[k][j]), skusme to dekonstruovat

"Nahodny" pristup do pamati je pomaly!

Code

#define TILE 32
// FIX: TILE_PT was used below but never defined in the listing (the code
// did not compile as given). It is the padded row stride of the shared
// tiles: TILE+1 makes column accesses hit different shared-memory banks.
#define TILE_PT (TILE+1)

// Tiled matrix multiply: each block stages a TILE x TILE window of A and
// of B in shared memory, so each global element is read once per tile
// instead of once per output element. Requires blockDim == (TILE, TILE).
__global__ void kernel_mmul_SM(FLOATING *a,FLOATING *b,FLOATING *tgt,uint size){
    __shared__ FLOATING sha[TILE*TILE_PT];
    __shared__ FLOATING shb[TILE*TILE_PT];
    uint x=blockDim.x*blockIdx.x + threadIdx.x;
    uint y=blockDim.y*blockIdx.y + threadIdx.y;
    uint k,win;
    FLOATING val=0;
    for(win=0;win<size;win+=TILE){
        // threads outside the matrix load 0, so the inner product needs
        // no bounds checks
        sha[threadIdx.x+threadIdx.y*TILE_PT]=(y<size&&win+threadIdx.x<size?a[threadIdx.x+win+y*size]:0);
        shb[threadIdx.x+threadIdx.y*TILE_PT]=(x<size&&win+threadIdx.y<size?b[x+(threadIdx.y+win)*size]:0);
        __syncthreads();   // both tiles fully loaded before anyone reads
        for(k=0;k<TILE;k++) val+=sha[k+threadIdx.y*TILE_PT]*shb[k*TILE_PT+threadIdx.x];
        __syncthreads();   // done reading before the next window overwrites
    }
    if (x<size && y<size) tgt[x+y*size]=val;
}

// Host wrapper for the shared-memory kernel: square TILE x TILE blocks,
// ceil-div grid, same copy-in / launch / copy-out pattern as before.
void d_mmul(FLOATING *h_a,FLOATING *h_b,FLOATING *h_tgt,uint size){
    cudaSetDevice(DEVICE);
    FLOATING *d_a,*d_b,*d_tgt;
    Ticker tick;
    dim3 gdimt,bdimt;
    int tile;
    tile=TILE;
    cudaMalloc((void**)&d_a, SOF*size*size);
    cudaMalloc((void**)&d_b, SOF*size*size);
    cudaMalloc((void**)&d_tgt, SOF*size*size);
    cudaMemcpy(d_a,h_a,size*size*SOF,H2D);
    cudaMemcpy(d_b,h_b,size*size*SOF,H2D);
    bdimt = dim3(tile,tile,1);
    gdimt = dim3((size+tile-1)/tile,(size+tile-1)/tile,1);
    tick.tick();
    kernel_mmul_SM<<<gdimt,bdimt>>>(d_a,d_b,d_tgt,size);
    cudaDeviceSynchronize();
    tick.tick("Exec");
    cudaMemcpy(h_tgt,d_tgt,size*size*SOF,D2H);
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_tgt);
    CUDAEXITONERROR("Done");
}

Code

// Tiled multiply with a runtime tile size. Both tiles share one
// dynamically sized shared-memory buffer: A's tile at offset 0, B's tile
// at offset tile*tile. The launch must pass tile*tile*2*SOF bytes as the
// third launch-configuration parameter. Requires blockDim == (tile, tile).
// FIX: the buffer was declared "extern __shared__ float sh[]"; under
// -DDOUBLE_PRECISION that silently truncated every staged element to
// float and disagreed with the SOF-based allocation — it must be FLOATING.
__global__ void kernel_mmul_SMD(FLOATING *a,FLOATING *b,FLOATING *tgt,uint size,uint tile){
    extern __shared__ FLOATING sh[];
    uint x=blockDim.x*blockIdx.x + threadIdx.x;
    uint y=blockDim.y*blockIdx.y + threadIdx.y;
    uint k,win;
    FLOATING val=0;
    for(win=0;win<size;win+=tile){
        // stage the current window; out-of-range threads store 0
        sh[threadIdx.x+threadIdx.y*tile]=(y<size&&win+threadIdx.x<size?a[threadIdx.x+win+y*size]:0);
        sh[tile*tile+threadIdx.x+threadIdx.y*tile]=(x<size&&win+threadIdx.y<size?b[x+(threadIdx.y+win)*size]:0);
        __syncthreads();   // tiles complete before reading
        for(k=0;k<tile;k++) val+=sh[k+threadIdx.y*tile]*sh[tile*tile+k*tile+threadIdx.x];
        __syncthreads();   // reads complete before next overwrite
    }
    if (x<size && y<size) tgt[x+y*size]=val;
}

// Example launch: the third <<<>>> argument is the dynamic shared-memory
// size in bytes (two tile*tile FLOATING tiles).
kernel_mmul_SMD<<<gdimt,bdimt,tile*tile*2*SOF>>>(d_a,d_b,d_tgt,size,tile);

Zadania: [Odovzdajte okomentovane riesenie uloh 9-14]

  1. Ziskajte pristup ku kontu studentX , zmente heslo (prikaz passwd) a vytvorte subor obsahujuci Vase meno
    ssh studentX@158.197.31.56 -p 222
    passwd
    > meno
  2. Pokial ste zabudli pridelene cislo, mozete skusat v rozsahu student1 az student20, ci ostalo konto s povodnym heslom
  3. Vytvorte subor s Prvym programom Vektor * skalar (skopirovanim textu z tohto materialu)
  4. Program skompilujte a spustajte pomocou skriptu curun ./main
  5. Vyskusajte vplyv prepinaca -O3 na cas behu vypoctu na CPU
  6. Vyskusajte vplyv prepinaca DOUBLE_PRECISION na cas behu vypoctu na CPU a GPU
  7. Vyskusajte vplyv pouzitia cudaMallocHost((void**)&d_a,SOF*size); namiesto a=new FLOATING[size]; a cudaFreeHost(d_a); namiesto delete a; (ditto pole device) (pamat sa priradi kontinualne)
  8. Pouzite profilaciu - spustenie so skriptom curun nvprof ./main
  9. [1b] Okomentujte zistene pozorovania z bodov 5. - 8.
  10. [1b] Upravte program tak, aby pocital Hadamardov sucin (po elementoch) dvoch vektorov (c[i]=a[i]*b[i]).
    Popiste upravu a okomentujte vysledok
  11. [1b] Upravte program tak, aby namiesto sucinu a*b pocital funkciu:
    if (b[i]>0.5) c[i]=SIN(a[i])*COS(b[i]);else c[i]=COS(a[i])*SIN(b[i]);
    Porovnajte vysledky s tymto zapisom funkcie a s optimalizovanym zapisom funkcie (podla prezentacie - bez if)
  12. [2b] Skopirujte a prelozte program na nasobenie matic so standardnym postupom nasobenia (s jadrom mmul v globalnej pamati a jadrom mmul_SM v zdielanej pamati) a porovnajte vysledky so sekvencnym riesenim. Skuste upravit tieto programy (sekvencny, paralelny mmul a paralelny mmul_SM) tak, aby vyuzivali nasobenie po riadkoch (ako su postupne ulozene hodnoty v pamati). Vysledky porovnajte (mozete predpokladat ze matica B je uz vygenerovana v transponovanom tvare).
  13. [1b] Skopirujte zdrojove texty z teaching.cu a skompilujte ich na serveri.
    Komentujte ziskane vysledky vsetkych funkcii. Preco v pripade funkcie "add_global" vznika taka velka chyba ?
  14. [2b] Upravte kernel "kernel_add_shuffle_block" tak, aby ho na vypocet sumy pola stacilo volat
    vo funkcii "d_add_shuffle_block" iba raz (okomentujte pridanu cast do kernelu).