47 constexpr const int runs = 2;
48 std::array<GPUEvent, runs+1> events;
51 float bestTime = std::numeric_limits<float>::max();
52 int bestBlockSize = -1;
56 for (
int thrBlockSize = interval; thrBlockSize <= 1024; thrBlockSize += interval) {
59 OPM_GPU_SAFE_CALL(cudaEventRecord(events[0].get()));
60 for (
int i = 0; i < runs; ++i) {
62 OPM_GPU_SAFE_CALL(cudaEventRecord(events[i + 1].get()));
66 OPM_GPU_SAFE_CALL(cudaEventSynchronize(events[runs].get()));
69 if (cudaSuccess == cudaGetLastError()) {
71 for (
int i = 0; i < runs; ++i) {
72 float candidateBlockSizeTime;
73 OPM_GPU_SAFE_CALL(cudaEventElapsedTime(&candidateBlockSizeTime, events[i].get(), events[i + 1].get()));
74 if (candidateBlockSizeTime < bestTime) {
75 bestTime = candidateBlockSizeTime;
76 bestBlockSize = thrBlockSize;
83 fmt::format(
"[Kernel tuning completed] {}: Tuned Blocksize = {}, Fastest Runtime = {}ms.", descriptionOfFunction, bestBlockSize, bestTime));