首页
登录 | 注册

SSE加速实战之 二阶范数计算

这里分别测试了三种实现:纯 C 版本、SSE C++(intrinsics)加速版本,以及 SSE 汇编加速版本。

计算二维向量的二阶范数:先求各分量的平方和,再求平方根,即 $\sqrt{x^2 + y^2}$。

结果使用纯C反而更快。测试环境:WIN10 + VS2013

结果是

arrayCalcCPP 0.040ms
arrayCalcSSE 0.207ms
arrayCalcSSE2 0.208ms
arrayCalcSSEASM 0.207ms


使用编译器优化的纯C 比 手动写的并行加速还要快5倍

如果有网友能说出具体原因,欢迎留言~


/*
 * Scalar reference implementation: for every index k in [0, len) stores
 * sqrt(p1[k]^2 + p2[k]^2) into pSum[k].
 *
 * p1, p2 : input arrays, each read at indices 0 .. len-1.
 * pSum   : output array, written at indices 0 .. len-1.
 * len    : element count; nothing is written when len <= 0.
 */
void arrayCalcCPP(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 k = 0;

    while (k < len)
    {
        const F32 x = p1[k];
        const F32 y = p2[k];

        pSum[k] = sqrtf(x * x + y * y);
        ++k;
    }
}

/*
 * SSE implementation that accesses the buffers directly as __m128 objects:
 * stores sqrt(p1[i]^2 + p2[i]^2) into pSum[i] for every i in [0, len).
 *
 * p1, p2 : input arrays of len floats.
 * pSum   : output array of len floats.
 * len    : element count; nothing is written when len <= 0.
 *
 * The buffers are dereferenced through __m128 pointers, so all three must be
 * 16-byte aligned. Full groups of four elements are handled with SSE; the
 * remaining len % 4 elements are handled with scalar code so that the whole
 * output is written even when len is not a multiple of 4.
 */
void arrayCalcSSE(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 i = 0;
    S32 end = 0;

    __m128 m1, m2, m3;

    __m128 *_p1 = (__m128 *)p1;
    __m128 *_p2 = (__m128 *)p2;
    __m128 *_psum = (__m128 *)pSum;

    if (len <= 0)
    {
        return;
    }

    /* Number of complete 4-float groups. */
    end = len >> 2;

    for (i = 0; i < end; i++)
    {
        m1 = _mm_mul_ps(*_p1, *_p1);
        m2 = _mm_mul_ps(*_p2, *_p2);
        m3 = _mm_add_ps(m1, m2);
        *_psum = _mm_sqrt_ps(m3);

        _p1++;
        _p2++;
        _psum++;
    }

    /* Scalar tail: elements left over after the last complete group. */
    for (i = end << 2; i < len; i++)
    {
        pSum[i] = sqrtf(p1[i] * p1[i] + p2[i] * p2[i]);
    }
}

/*
 * SSE implementation using explicit load/store intrinsics:
 * stores sqrt(p1[i]^2 + p2[i]^2) into pSum[i] for every i in [0, len).
 *
 * p1, p2 : input arrays of len floats.
 * pSum   : output array of len floats.
 * len    : element count; nothing is written when len <= 0.
 *
 * _mm_load_ps / _mm_store_ps are the aligned variants, so all three pointers
 * must be 16-byte aligned. Full groups of four elements are handled with SSE;
 * the remaining len % 4 elements are handled with scalar code so that the
 * whole output is written even when len is not a multiple of 4.
 */
void arrayCalcSSE2(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 i = 0;
    S32 end = 0;

    __m128 m1, m2, m3;

    if (len <= 0)
    {
        return;
    }

    /* Number of complete 4-float groups. */
    end = len >> 2;

    for (i = 0; i < end; i++)
    {
        m1 = _mm_load_ps(p1);
        m2 = _mm_load_ps(p2);
        m1 = _mm_mul_ps(m1, m1);
        m2 = _mm_mul_ps(m2, m2);
        m3 = _mm_add_ps(m1, m2);
        m3 = _mm_sqrt_ps(m3);
        _mm_store_ps(pSum, m3);

        p1 += 4;
        p2 += 4;
        pSum += 4;
    }

    /*
     * Scalar tail: p1, p2 and pSum now point at the first element that the
     * vector loop did not cover.
     */
    for (i = end << 2; i < len; i++)
    {
        *pSum = sqrtf((*p1) * (*p1) + (*p2) * (*p2));

        p1++;
        p2++;
        pSum++;
    }
}

/*
 * SSE implementation written as MSVC x86 inline assembly:
 * stores sqrt(p1[i]^2 + p2[i]^2) into pSum[i] for every i in [0, len).
 *
 * p1, p2 : input arrays of len floats.
 * pSum   : output array of len floats.
 * len    : element count; nothing is written when len <= 0.
 *
 * movaps is the aligned move, so all three pointers must be 16-byte aligned.
 * Full groups of four elements are handled by the assembly loop; the
 * remaining len % 4 elements are handled with scalar code so that the whole
 * output is written even when len is not a multiple of 4.
 */
void arrayCalcSSEASM(F32 *p1, F32 *p2, F32 *pSum, S32 len)
{
    S32 i = 0;
    S32 end = 0;

    if (len <= 0)
    {
        return;
    }

    /* Number of complete 4-float groups. */
    end = len >> 2;

    /*
     * The assembly loop tests its counter only at the bottom (dec/jnz), so it
     * must not be entered with a zero count: the body would run once on data
     * that is out of range and ecx would then wrap around instead of
     * terminating the loop.
     */
    if (end > 0)
    {
        _asm
        {
            mov esi, p1 // esi = address of input array 1
            mov edx, p2 // edx = address of input array 2

            mov edi, pSum // edi = address of the output array
            mov ecx, end // ecx = loop counter (number of 4-float groups)

            start_loop :
            movaps xmm0, [esi] // xmm0 = [esi]
            mulps xmm0, xmm0 // xmm0 = xmm0 * xmm0

            movaps xmm1, [edx] // xmm1 = [edx]
            mulps xmm1, xmm1 // xmm1 = xmm1 * xmm1

            addps xmm0, xmm1 // xmm0 = xmm0 + xmm1
            sqrtps xmm0, xmm0 // xmm0 = sqrt(xmm0)

            movaps [edi], xmm0 // [edi] = xmm0

            add esi, 16 // esi += 16 (advance by four floats)
            add edx, 16 // edx += 16
            add edi, 16 // edi += 16

            dec ecx // ecx--
            jnz start_loop // loop again while ecx != 0
        }
    }

    /* Scalar tail: elements left over after the last complete group. */
    for (i = end << 2; i < len; i++)
    {
        pSum[i] = sqrtf(p1[i] * p1[i] + p2[i] * p2[i]);
    }
}

#define ARRAY_LENGTH 100000
void SSE_test()
{
    F32 *array1 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array2 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array3 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array4 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array5 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);

    for (int i = 0; i < ARRAY_LENGTH; i++)
    {
        array1[i] = i * 1.f;
        array2[i] = i * 1.f;
    } 

    arrayCalcCPP(array1, array2, array3, ARRAY_LENGTH);
    arrayCalcSSE(array1, array2, array4, ARRAY_LENGTH);
    arrayCalcSSEASM(array1, array2, array5, ARRAY_LENGTH);
}

void SSE_testP()
{
    F32 *array1 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array2 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array3 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array4 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);
    F32 *array5 = (F32 *)GP_ALIGN_MALLOC16(sizeof(F32) * ARRAY_LENGTH);

    for (int i = 0; i < ARRAY_LENGTH; i++)
    {
        array1[i] = i * 1.f;
        array2[i] = i * 1.f;
    }

    double t;

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        arrayCalcCPP(array1, array2, array3, ARRAY_LENGTH);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("arrayCalcCPP %.3fms\n", t);

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        arrayCalcSSE(array1, array2, array4, ARRAY_LENGTH);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("arrayCalcSSE %.3fms\n", t);

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        arrayCalcSSE2(array1, array2, array4, ARRAY_LENGTH);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("arrayCalcSSE2 %.3fms\n", t);

    t = (double)getTickCount();
    for (int i = 0; i < CYC_NUM; i++)
    {
        arrayCalcSSEASM(array1, array2, array5, ARRAY_LENGTH);
    }
    t = ((double)getTickCount() - t) * 1000 / CYC_NUM / getTickFrequency();
    printf("arrayCalcSSEASM %.3fms\n", t);
}




2020 jeepxie.net webmaster#jeepxie.net
10 q. 0.008 s.
京ICP备10005923号