Mysterious crash with OpenMP and AVX2 vector intrinsics

1 year ago

#351639

drakon101

I have the following two functions:

void bfm(const Parameters& p, int& idx, Eigen::Ref<Eigen::MatrixXd> bfFrame,
         const Eigen::Ref<const Eigen::Matrix<short int, -1, -1>>& SIG) {

    #pragma omp parallel for
    for (int i = 0; i < p.nPoints; i++) {
        Eigen::VectorXd idxt(p.numEl);     
        int col = i / p.szZ;
        int row = i % p.szZ;
        calcIDXT(p,p.xCoord(col),p.zCoord(row),idx,idxt);// Calculate delay indices
        calcPoint(p,idxt,SIG,bfFrame(row,col));
    }
    
}

and:

void calcPoint(const Parameters& p, const Eigen::Ref<const Eigen::VectorXd>& idxt,
               const Eigen::Ref<const Eigen::Matrix<short int, -1, -1>>& SIG,
               double& bfVal) {
    Eigen::VectorXd bfVec = Eigen::VectorXd::Zero(p.nc);
    for (int j = 0; j < p.nc; j+=4) {
        __m256d res = linearAVX( (const double*) (idxt.data()+j));//, (const int16_t*) (SIG.data() + j * p.szRF),p.szRF);
        _mm256_storeu_pd(bfVec.data()+j,_mm256_loadu_pd( idxt.data() + j ));
    }
    bfVal = bfVec.sum();
}

Right now, the function linearAVX essentially only calls _mm256_loadu_pd( idxt.data() + j) and returns that. The rest of the function is commented out. The casting in the parameters for linearAVX is also redundant (I think) and placed there for debugging purposes.

For some reason, this code compiles fine but crashes at runtime. I can avoid a crash if I comment out the call to linearAVX or I can avoid a crash if I comment out the #pragma omp line. What about linearAVX is not thread safe?

linearAVX code:

// Perform linear interpolation using AVX instructions.
//      pIndex  - pointer to value in idxt array
//      pRFCol  - pointer to first channel of RF data
//      N       - number of elements in a single RF channel
__m256d linearAVX( const double* pIndex)//, const int16_t* pRFCol, int N)
{
    // Load 4 idxt values
    __m256d idxt = _mm256_loadu_pd( pIndex );
    
    // Compute idxf and idxd values
//     __m256d idxf = _mm256_floor_pd( idxt );
//     __m256d idxd = _mm256_sub_pd( idxt, idxf );
// 
//     // Compute s1...sN
//     __m256d s1 = _mm256_sub_pd( _mm256_set1_pd( 1.0 ), idxd ); // Note, s2 = idxd for linear interp
    
//     // Gather RF values
//     int base_addr = static_cast<int>(*pIndex);
//     __m128i vindex = _mm_set_epi32(0,N-base_addr,2*N-base_addr,3*N-base_addr);        // NOTE: CHECK THAT THIS IS CORRECT WAY TO PASS VALUES
//     __m128i rfVals = _mm_i32gather_epi32( (const int*) (base_addr + pRFCol),vindex,2);
// 
//     // Upconvert, shuffle and separate
//     __m256i rfUpconv = _mm256_cvtepi16_epi32(rfVals);
//     __m256i rfShuf = _mm256_permute4x64_epi64(_mm256_shuffle_epi32(rfUpconv,216),216); // 216 = 0b11011000 in binary, 0xD8 in Hex. See documentation
//     // Additional note: using _mm256_permutevar8x32_epi32 may be faster? Experiment with this and find out later
// 
//     __m256d rf1 = _mm256_cvtepi32_pd(_mm256_extracti128_si256(rfShuf,0));
//     __m256d rf2 = _mm256_cvtepi32_pd(_mm256_extracti128_si256(rfShuf,1));
// 
//     // Multiply coeffs and sum to get result
//     __m256d result = _mm256_add_pd(_mm256_mul_pd(rf1,s1),_mm256_mul_pd(rf2,idxd));
// 
//     return result;
    
    return idxt;

}

c++

openmp

simd

avx2

0 Answers

Your Answer

Posts

Questions

Blogs

Jobs