I'm trying to maintain static buffers on the accelerator to avoid repeated mallocs, since my OpenCL code gives substantial performance improvements with it. So I have a setup where I have an array<float, 1> for gpu buffers, and a staging array to help speed up copies. To chain the copy operations (CPU->staging->GPU buffer), I'm using continuations with completion_future. However, my code crashes with a memory access error on the copy back (GPU buffer -> staging -> CPU memory). In addition, the crashes are not deterministic, they happen at varying iterations. I've checked that the queued operations are trying to access the correct memory address, and there is sufficient memory allocated for it. Here's my code:
#include <amp.h> #include <amp_math.h> #include <iostream> #include <vector> #include <cstdlib> using namespace std; using namespace concurrency; #define FRAND (static_cast<float>(rand()) / RAND_MAX) #define FSRAND (FRAND - 0.5f) #define USE_COMPLETIONS void testfunc () { const int n = 10000, allocn = 20000; static array<float, 1> staging[] = {array<float, 1>(n, accelerator(accelerator::cpu_accelerator).default_view, accelerator(accelerator::default_accelerator).default_view), array<float, 1>(n, accelerator(accelerator::cpu_accelerator).default_view, accelerator(accelerator::default_accelerator).default_view), array<float, 1>(n, accelerator(accelerator::cpu_accelerator).default_view, accelerator(accelerator::default_accelerator).default_view), }; static array<float, 1> gpumem[] = {array<float, 1>(allocn), array<float, 1>(allocn), array<float, 1>(allocn)}; static bool isinit = false; static vector<float> a(n), b(n), c(n), d(n); if (!isinit) { a.resize(n); b.resize(n); c.resize(n); for (int i = 0; i < n; ++i) {a[i] = FSRAND; b[i] = FSRAND; c[i] = FSRAND;} } vector<float>::iterator worklists[] = {a.begin(), b.begin(), c.begin(), a.end(), b.end(), c.end()}; vector<float> *wlptrs[] = {&a, &b, &c}; cout << "init complete" << endl; #ifdef USE_COMPLETIONS for (int i = 0; i < 3; ++i) try { completion_future cf = copy_async (worklists[i], worklists[3+i], staging[i]); cf.then([&, i] { cout << "Continuing copyinit " << i << endl; cout.flush(); copy_async(staging[i], gpumem[i]).wait(); }); //cf.get(); } catch (std::exception ex) {cout << "Caught exception in copyinit: " << ex.what() << endl;cout.flush();} #else shared_future<void> waitlists[3]; for (int i = 0; i < 3; ++i) try { waitlists[i] = copy_async (worklists[i], worklists[3+i], staging[i]); } catch (std::exception ex) {cout << "Caught exception in copyinit: " << ex.what() << endl;} for (int i = 0; i < 3; ++i) waitlists[i].wait(); for (int i = 0; i < 3; ++i) try { waitlists[i] = copy_async (staging[i], gpumem[i]); } catch (std::exception ex) {cout << "Caught exception in copyinit: " << ex.what() << endl;} for (int i = 0; i < 3; ++i) waitlists[i].wait(); #endif array<float, 1> &ga = gpumem[0], &gb = gpumem[1], &gc = gpumem[2]; cout << "transfers to gpu scheduled" << endl; cout.flush(); try { parallel_for_each (concurrency::extent<1>(n), [&](index<1> idx) restrict(amp) { gc[idx] = ga[idx] + gb[idx]; ga[idx] = gb[idx]; gb[idx] = gc[idx]; } ); } catch (std::exception ex) { cout << "Caught exception in kernel: " << ex.what() << endl;cout.flush(); } cout << "kernel scheduled" << endl;cout.flush(); #ifdef USE_COMPLETIONS for (int i = 0; i < 3; ++i) try { completion_future cf = copy_async (gpumem[i].section(0, n), staging[i]); cf.then([&, i] { cout << "Continuing copyback " << i << " trying to copy to vector at " << (void*)wlptrs[i] << " of size " << wlptrs[i]->size() << " with ptr range: " << &(*wlptrs[i])[0] << ", " << &(wlptrs[i]->back()) << endl; copy_async(staging[i], wlptrs[i]->begin()).wait(); }); //cf.get(); } catch (std::exception ex) {cout << "Caught exception in copyback: " << ex.what() << endl;cout.flush();} #else for (int i = 0; i < 3; ++i) try { waitlists[i] = copy_async (gpumem[i].section(0,n), staging[i]); } catch (std::exception ex) {cout << "Caught exception in copyback: " << ex.what() << endl;} for (int i = 0; i < 3; ++i) waitlists[i].wait(); for (int i = 0; i < 3; ++i) try { waitlists[i] = copy_async (staging[i], worklists[i]); } catch (std::exception ex) {cout << "Caught exception in copyback: " << ex.what() << endl;} for (int i = 0; i < 3; ++i) waitlists[i].wait(); #endif cout << "transfer back scheduled" << endl;cout.flush(); } int main (int argc, char **argv) { srand(0); for (int i = 0; i < 10000; ++i) { testfunc(); } return 0; }Could someone help point out what I'm doing wrong in this example? I'm only using continuations for copies, and I have the regular copy code in there too to compare. This is a knocked down version of what I'm doing, I actually have a lot more buffers and a more complicated kernel. That's why I'm trying to use loops for all the copies.