Convert yuv nv12 to icm2 C snippets

Description:
Convert YUV NV12 (interleaved UV samples) to ICM2 (line-interleaved UV). For a description of the different formats, see fourcc.org/yuv. Includes plain C and Intel SSE vectorized implementations.
Copyright
2014, Torbjørn Tyridal
License:
BSD-3-Clause
[yuv_convert.c]
/*
Functions for converting yuv nv12 formatted images to yuv icm2
(refer to http://www.fourcc.org/yuv.php for explanation of formats)

Copyright (c) 2014, Torbjørn Tyridal
All rights reserved.

Licensed under BSD-3-Clause ("BSD new"):

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
    * Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.
    * Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in the
      documentation and/or other materials provided with the distribution.
    * Neither the name of the <organization> nor the
      names of its contributors may be used to endorse or promote products
      derived from this software without specific prior written permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#ifdef __SSE2__
#include <emmintrin.h>
#endif
#ifdef __SSE3__
#include "tmmintrin.h"
#include "pmmintrin.h"
#endif
#ifdef __SSE4_1__
#include "smmintrin.h"
#endif

/*
 * SSE2 implementation: split an interleaved chroma plane (u0 v0 u1 v1 ...)
 * into rows whose first half holds the U samples and whose second half
 * holds the V samples.
 *
 * nv_uv     - source chroma plane; read with aligned 128-bit loads, so it
 *             must be 16-byte aligned.
 * icm2_uv   - destination plane, same size as the source; must not overlap
 *             the source (restrict).
 * uv_stride - size of one chroma row in bytes; must be a multiple of 32.
 *             The whole stride is converted.
 * height    - image (luma) height in rows; must be even. height / 2 chroma
 *             rows are converted.
 */
void nv12_to_icm2_v(const unsigned long long * restrict nv_uv,
        unsigned long long * restrict icm2_uv,
        unsigned uv_stride, unsigned height)
{
    assert(uv_stride % 32 == 0);
    assert(height % 2 == 0);

    const __m128i zero = _mm_setzero_si128();
    const size_t row_words = uv_stride / sizeof(*nv_uv);    /* 64-bit words per chroma row */
    const size_t half_words = row_words / 2;                /* offset of the V half in a row */
    const size_t vecs_per_row = uv_stride / sizeof(__m128i);
    const unsigned rows = height / 2;

    for (unsigned y = 0; y < rows; y++) {
        /* Each chroma row has its own source and destination offset. */
        const unsigned long long *src = nv_uv + (size_t)y * row_words;
        unsigned long long *dst = icm2_uv + (size_t)y * row_words;

        for (size_t x = 0; x < vecs_per_row; x++) {
            __m128i r = _mm_load_si128((const __m128i *)&src[2 * x]);

            /* Widen bytes to 16-bit lanes so the 16-bit shuffles can be used. */
            __m128i r_lo = _mm_unpacklo_epi8(r, zero); // u0,v0,u1,v1 ,u2,v2,u3,v3 (shorts)
            __m128i r_hi = _mm_unpackhi_epi8(r, zero); // u4,v4,u5,v5 ,u6,v6,u7,v7

            r_lo = _mm_shufflelo_epi16(r_lo, _MM_SHUFFLE(3,1,2,0)); // u0u1 v0v1 | u2,v2,u3,v3
            r_lo = _mm_shufflehi_epi16(r_lo, _MM_SHUFFLE(3,1,2,0)); // u0u1 v0v1 u2u3 v2v3
            r_lo = _mm_shuffle_epi32(r_lo, _MM_SHUFFLE(3,1,2,0));   // u0u1u2u3 v0v1v2v3
            r_hi = _mm_shufflelo_epi16(r_hi, _MM_SHUFFLE(3,1,2,0));
            r_hi = _mm_shufflehi_epi16(r_hi, _MM_SHUFFLE(3,1,2,0));
            r_hi = _mm_shuffle_epi32(r_hi, _MM_SHUFFLE(3,1,2,0));   // u4u5u6u7 v4v5v6v7

            __m128i u = _mm_unpacklo_epi64(r_lo, r_hi); // u0u1u2u3 u4u5u6u7
            __m128i v = _mm_unpackhi_epi64(r_lo, r_hi); // v0v1v2v3 v4v5v6v7
            u = _mm_packus_epi16(u, zero); // back to u8, result in the low 64 bits
            v = _mm_packus_epi16(v, zero);

            _mm_storel_epi64((__m128i *)&dst[x], u);
            _mm_storel_epi64((__m128i *)&dst[x + half_words], v);
        }
    }
}

/* A 64-bit word overlaid with its eight bytes in memory order. */
union longlong_char {
    unsigned long long ul;
    char c[8];
};

/*
 * Plain C implementation using a union to pick bytes: split an interleaved
 * chroma plane (u0 v0 u1 v1 ...) into rows whose first half holds the U
 * samples and whose second half holds the V samples.
 *
 * nv_uv     - source chroma plane.
 * icm2_uv   - destination plane, same size as the source; must not overlap
 *             the source (restrict).
 * uv_stride - size of one chroma row in bytes; must be a multiple of 32.
 *             The whole stride is converted.
 * height    - image (luma) height in rows; must be even. height / 2 chroma
 *             rows are converted.
 */
void nv12_to_icm2_2(const unsigned long long * restrict nv_uv,
        unsigned long long * restrict icm2_uv,
        unsigned uv_stride, unsigned height)
{
    assert(uv_stride % 32 == 0);
    assert(height % 2 == 0);

    const size_t row_words = uv_stride / sizeof(*nv_uv);    /* 64-bit words per chroma row */
    const size_t half_words = row_words / 2;                /* offset of the V half in a row */
    const unsigned rows = height / 2;

    for (unsigned y = 0; y < rows; y++) {
        /* Each chroma row has its own source and destination offset. */
        const unsigned long long *src = nv_uv + (size_t)y * row_words;
        unsigned long long *dst = icm2_uv + (size_t)y * row_words;

        for (size_t x = 0; x < half_words; x++) {
            /* Two input words (16 interleaved bytes) yield one U and one V word. */
            union longlong_char r1 = {src[2 * x]};
            union longlong_char r2 = {src[2 * x + 1]};
            union longlong_char u, v;

            for (int i = 0; i < 4; i++) {
                u.c[i]     = r1.c[2 * i];       /* even bytes are U */
                u.c[i + 4] = r2.c[2 * i];
                v.c[i]     = r1.c[2 * i + 1];   /* odd bytes are V */
                v.c[i + 4] = r2.c[2 * i + 1];
            }

            dst[x] = u.ul;
            dst[x + half_words] = v.ul;
        }
    }
}

/* Byte n of integer r, counting from the least significant byte. */
#define nthByte(r, n) (((r) >> (8 * (n))) & 0xff)

/*
 * Plain C implementation using shifts and masks: split an interleaved
 * chroma plane (u0 v0 u1 v1 ...) into rows whose first half holds the U
 * samples and whose second half holds the V samples.
 *
 * NOTE: bytes are selected by their significance within a 64-bit word, so
 * this matches memory order only on little-endian hosts.
 *
 * nv_uv     - source chroma plane.
 * icm2_uv   - destination plane, same size as the source; must not overlap
 *             the source (restrict).
 * uv_stride - size of one chroma row in bytes; must be a multiple of 32.
 *             The whole stride is converted.
 * height    - image (luma) height in rows; must be even. height / 2 chroma
 *             rows are converted.
 */
void nv12_to_icm2_1(const unsigned long long * restrict nv_uv,
        unsigned long long * restrict icm2_uv,
        unsigned uv_stride, unsigned height)
{
    assert(uv_stride % 32 == 0);
    assert(height % 2 == 0);

    const size_t row_words = uv_stride / sizeof(*nv_uv);    /* 64-bit words per chroma row */
    const size_t half_words = row_words / 2;                /* offset of the V half in a row */
    const unsigned rows = height / 2;

    for (unsigned y = 0; y < rows; y++) {
        /* Each chroma row has its own source and destination offset. */
        const unsigned long long *src = nv_uv + (size_t)y * row_words;
        unsigned long long *dst = icm2_uv + (size_t)y * row_words;

        for (size_t x = 0; x < half_words; x++) {
            const unsigned long long r1 = src[2 * x];
            const unsigned long long r2 = src[2 * x + 1];

            /* Even bytes of the two input words form the U word ... */
            const unsigned long long u =
                (nthByte(r1, 0) << 0)  | (nthByte(r1, 2) << 8)  |
                (nthByte(r1, 4) << 16) | (nthByte(r1, 6) << 24) |
                (nthByte(r2, 0) << 32) | (nthByte(r2, 2) << 40) |
                (nthByte(r2, 4) << 48) | (nthByte(r2, 6) << 56);
            /* ... and the odd bytes form the V word. */
            const unsigned long long v =
                (nthByte(r1, 1) << 0)  | (nthByte(r1, 3) << 8)  |
                (nthByte(r1, 5) << 16) | (nthByte(r1, 7) << 24) |
                (nthByte(r2, 1) << 32) | (nthByte(r2, 3) << 40) |
                (nthByte(r2, 5) << 48) | (nthByte(r2, 7) << 56);

            dst[x] = u;
            dst[x + half_words] = v;
        }
    }
}

#if IS_MAIN
#include <string.h>
#include <stdio.h>
#include <time.h>

/*
 * 32-byte test buffer. The wider members give the union the alignment of
 * its most strictly aligned member (__m128i), which the aligned loads in
 * nv12_to_icm2_v rely on.
 */
union aligned_bytes{
    unsigned char buf[32];
    unsigned long long aslonglong;
    unsigned long aslong;
    unsigned asint;
    __m128i  asvect;
};

/*
 * Self-test and micro-benchmark: checks each implementation against a
 * known single-row result, then times 100M conversions of each.
 */
int main(void) {
    union aligned_bytes nv = {{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32}};
    union aligned_bytes icm = {{0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0}};
    const union aligned_bytes res ={{1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,32}};

    clock_t t1, t2, t3,t4;

    /* Clear the output before every check so a stale result from the
     * previous implementation cannot make the next check pass. */
    nv12_to_icm2_1(&nv.aslonglong,&icm.aslonglong,32,2);
    assert(memcmp(icm.buf, res.buf, sizeof icm.buf)==0);
    memset(icm.buf, 0, sizeof icm.buf);
    nv12_to_icm2_2(&nv.aslonglong,&icm.aslonglong,32,2);
    assert(memcmp(icm.buf, res.buf, sizeof icm.buf)==0);
    memset(icm.buf, 0, sizeof icm.buf);
    nv12_to_icm2_v(&nv.aslonglong,&icm.aslonglong,32,2);
    assert(memcmp(icm.buf, res.buf, sizeof icm.buf)==0);


    t1 = clock();
    for (unsigned long i=0; i<100000000; i++)
        nv12_to_icm2_v(&nv.aslonglong,&icm.aslonglong,32,2);
    t2 = clock();
    for (unsigned long i=0; i<100000000; i++)
        nv12_to_icm2_2(&nv.aslonglong,&icm.aslonglong,32,2);
    t3 = clock();
    /* "bitsh" is the shift/mask variant, nv12_to_icm2_1. */
    for (unsigned long i=0; i<100000000; i++)
        nv12_to_icm2_1(&nv.aslonglong,&icm.aslonglong,32,2);
    t4 = clock();

    /* clock_t has an implementation-defined type; cast to match %lu. */
    printf("vector: 100M in %lu (%.2fs)\n",(unsigned long)(t2-t1),(double)(t2-t1)/CLOCKS_PER_SEC);
    printf("union : 100M in %lu (%.2fs)\n",(unsigned long)(t3-t2),(double)(t3-t2)/CLOCKS_PER_SEC);
    printf("bitsh : 100M in %lu (%.2fs)\n",(unsigned long)(t4-t3),(double)(t4-t3)/CLOCKS_PER_SEC);


    return 0;
}
#endif