Cảm ơn bạn rất nhiều vì đã đọc chủ đề của tôi.cudaDeviceSynchronize() mã lỗi 77: cudaErrorIllegalAddress
Tôi đang làm công việc CUDA, nhưng tiếp tục nhận được mã lỗi cudaDeviceSynchronize() 77: cudaErrorIllegalAddress, không biết tại sao. Tôi đã tìm kiếm cả mã và chức năng, đáng ngạc nhiên, chỉ có một vài hồ sơ xuất hiện. Rất lạ.
Tôi về cơ bản tổng hợp tất cả các pixel của hình ảnh. Để làm cho câu hỏi của tôi có nhiều tài liệu tham khảo vì nó có thể, tôi thấy tất cả các mã CUDA của tôi ở đây: không gian bộ nhớ đủ và tương đương
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "thorcalgpu.h"
#include <stdio.h>
#include "math.h"
#include <vector>
#include <algorithm>
#include <stdlib.h>
#include <stdio.h>
#include <vector>
#include <numeric>
#include <iostream>
using namespace std;
float random_float(void)
{
return static_cast<float>(rand())/RAND_MAX;
}
__global__ void reduceSum(unsigned short *input,
unsigned long long *per_block_results,
const int n)
{
extern __shared__ unsigned long long sdata[];
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
// load input into __shared__ memory
unsigned short x = 0;
if(i < n)
{
x = input[i];
}
sdata[threadIdx.x] = x;
__syncthreads();
// contiguous range pattern
for(int offset = blockDim.x/2; offset > 0; offset >>= 1)
{
if(threadIdx.x < offset)
{
// add a partial sum upstream to our own
sdata[threadIdx.x] += sdata[threadIdx.x + offset];
}
// wait until all threads in the block have
// updated their partial sums
__syncthreads();
}
// thread 0 writes the final result
if(threadIdx.x == 0)
{
per_block_results[blockIdx.x] = sdata[0];
}
}
// Helper function for using CUDA to add vectors in parallel.
//template <class T>
cudaError_t gpuWrapper(float *mean, int N, vector<string> filelist)
{
int size = N*N;
unsigned long long* dev_sum = 0;
unsigned short* dev_img = 0;
cudaError_t cudaStatus;
const int block_size = 512;
const int num_blocks = (size/block_size) + ((size%block_size) ? 1 : 0);
int L = filelist.size();
// Choose which GPU to run on, change this on a multi-GPU system.
double totalgpuinittime = 0;
StartCounter(7);
cudaStatus = cudaSetDevice(0);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
goto Error;
}
// Allocate GPU buffers for three vectors (two input, one output) .
cudaStatus = cudaMalloc((void**)&dev_img, size * sizeof(unsigned short));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
cudaStatus = cudaMalloc((void**)&dev_sum, num_blocks*sizeof(unsigned long long));
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMalloc failed!");
goto Error;
}
totalgpuinittime = GetCounter(7);
unsigned short* img;
unsigned short* pimg;
unsigned long long* sum = new unsigned long long[num_blocks];
unsigned long long* psum = sum;
cout<<endl;
cout << "gpu looping starts, and in progress ..." << endl;
StartCounter(6);
double totalfileiotime = 0;
double totalh2dcpytime = 0;
double totalkerneltime = 0;
double totald2hcpytime = 0;
double totalcpusumtime = 0;
double totalloopingtime = 0;
for (int k = 0; k < L; k++)
{
StartCounter(1);
img = (unsigned short*)LoadTIFF(filelist[k].c_str());
totalfileiotime += GetCounter(1);
psum = sum;
pimg = img;
float gpumean = 0;
memset(psum, 0, sizeof(unsigned long long)*num_blocks);
StartCounter(2);
// Copy input vectors from host memory to GPU buffers.
cudaStatus = cudaMemcpy(dev_img, pimg, size * sizeof(unsigned short), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
cudaStatus = cudaMemcpy(dev_sum, psum, num_blocks*sizeof(unsigned long long), cudaMemcpyHostToDevice);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totalh2dcpytime += GetCounter(2);
StartCounter(3);
//reduceSum<<<num_blocks,block_size,num_blocks * sizeof(unsigned long long)>>>(dev_img, dev_sum, size);
//reduceSum<<<num_blocks,block_size,block_size * sizeof(unsigned short)>>>(dev_img, dev_sum, size);
reduceSum<<<num_blocks,block_size>>>(dev_img, dev_sum, size);
totalkerneltime += GetCounter(3);
// Check for any errors launching the kernel
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "reduction Kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
}
// cudaDeviceSynchronize waits for the kernel to finish, and returns
// any errors encountered during the launch.
// !!!!!! following is where the code 77 error occurs!!!!!!!
cudaStatus = cudaDeviceSynchronize();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
goto Error;
}
// Copy output vector from GPU buffer to host memory.
StartCounter(4);
cudaStatus = cudaMemcpy(psum, dev_sum, num_blocks * sizeof(unsigned long long), cudaMemcpyDeviceToHost);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaMemcpy failed!");
goto Error;
}
totald2hcpytime += GetCounter(4);
StartCounter(5);
for (int i = 0; i < num_blocks; i++)
{
gpumean += *psum;
psum++;
}
gpumean /= N*N;
totalcpusumtime += GetCounter(5);
delete img;
img = NULL;
cout<<gpumean<<endl;
}
int S = 1e+6;
int F = filelist.size();
float R = S/F;
totalloopingtime = GetCounter(6);
cout<<"gpu looping ends."<<endl<<endl;
cout<< "analysis:"<<endl;
cout<<"gpu initialization time: "<<totalgpuinittime<<" sec"<<endl<<endl;
cout<<"file I/O time: "<<endl;
cout<<" total "<<totalfileiotime<<" sec | average "<<totalfileiotime*R<<" usec/frame"<<endl<<endl;
cout<<"host-to-device copy time: "<<endl;
cout<<" total "<<totalh2dcpytime<<" sec | average "<<totalh2dcpytime*R<<" usec/frame"<<endl<<endl;
cout<<"pure gpu kerneling time: "<<endl;
cout<<" total "<<totalkerneltime<<" sec | average "<<totalkerneltime*R<<" usec/frame"<<endl<<endl;
cout<<"device-to-host copy time: "<<endl;
cout<<" total "<<totald2hcpytime<<" sec | average "<<totald2hcpytime*R<<" usec/frame"<<endl<<endl;
/*cout<<"cpu summing time: "<<endl;
cout<<" total: "<<totalcpusumtime<<" sec | average: "<<totalcpusumtime*R<<" usec/frame"<<endl<<endl;;*/
/*cout <<"gpu looping time: " << endl;
cout<<" total: "<<totalloopingtime<<" sec | average: "<<totalloopingtime*R<<" usec/frame"<<endl;*/
Error:
cudaFree(dev_sum);
cudaFree(dev_img);
delete sum;
sum = NULL;
return cudaStatus;
}
void kernel(float* &mean, int N, vector<string> filelist)
{
// wrapper and kernel
cudaError_t cudaStatus = gpuWrapper(mean, N, filelist);
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "gpuWapper failed!");
}
// printf("mean is: %f\n", mean);
// cudaDeviceReset must be called before exiting in order for profiling and
// tracing tools such as Nsight and Visual Profiler to show complete traces.
StartCounter(8);
cudaStatus = cudaDeviceReset();
if (cudaStatus != cudaSuccess)
{
fprintf(stderr, "cudaDeviceReset failed!");
}
cout<<"gpu reset time: "<<GetCounter(8)<<" sec"<<endl<<endl;
//return *mean;
}
tôi đã gán cho cả máy và bộ nhớ điện thoại. Bất kỳ ý kiến được đánh giá cao.