/*
 * Copyright © 2008 Nokia Corporation
 *
 * Permission to use, copy, modify, distribute and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the names of the authors and/or copyright holders
 * not be used in advertising or publicity pertaining to distribution of the
 * software without specific, written prior permission.  The authors and
 * copyright holders make no representations about the suitability of this
 * software for any purpose.  It is provided "as is" without any express
 * or implied warranty.
 *
 * THE AUTHORS AND COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
 * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
 * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 *
 * Portions based on fbpict.c
 */

#ifndef _FBARMV6_INTERNAL_H_
#define _FBARMV6_INTERNAL_H_

#include <stdint.h>

#ifdef USE_ARMV6

/*
 * Unsigned saturating addition of four packed 8-bit lanes, done with the
 * single ARMv6 'uqadd8' instruction.
 */
static inline uint32_t uqadd8(uint32_t a, uint32_t b)
{
    uint32_t sum;

    asm (
        "uqadd8 %[sum], %[lhs], %[rhs] \n\t"
        : [sum] "=r" (sum)
        : [lhs] "r" (a), [rhs] "r" (b)
    );

    return sum;
}

#else

/*
 * Portable stand-in for the ARMv6 'uqadd8' instruction (slow, meant for
 * testing only): adds the four bytes of 'a' and 'b' lane by lane and clamps
 * each lane to 0xFF instead of letting it carry into its neighbour.
 */
static inline uint32_t uqadd8(uint32_t a, uint32_t b)
{
    uint32_t packed = 0;
    unsigned int shift;

    for (shift = 0; shift < 32; shift += 8)
    {
        uint32_t lane = ((a >> shift) & 0xFF) + ((b >> shift) & 0xFF);

        if (lane > 0xFF)
            lane = 0xFF;

        packed |= lane << shift;
    }

    return packed;
}

#endif

/*
 * Reference C version of the optimized 'fbCompositeSolidMask_nx8x0565'
 * operation; the only assembly involved is the 'uqadd8' instruction.
 *
 * The 32-bit colour 'src' (alpha in the top byte) is scaled by each 8-bit
 * 'mask' value and blended over the r5g6b5 pixel at 'dst'.
 *
 * 'w' and 'h' are the rectangle size in pixels.  'dst_stride_delta' and
 * 'mask_stride_delta' are NOT strides: they must be 'stride - width',
 * because both pointers have already moved 'w' elements when a row ends.
 *
 * A nonzero 'opaquesrc' enables the shortcut that stores the pre-converted
 * colour wherever the mask is 0xFF.  It is passed as a constant by the
 * caller so that each inlined copy can drop the unused path.
 */
static inline void fbCompositeSolidMask_nx8x0565_internal_mixed_armv6_c_helper(
    uint16_t *dst, uint8_t *mask, uint32_t src, int w,
    int dst_stride_delta, int mask_stride_delta, int h, const int opaquesrc)
{
    const uint32_t round_bias = 0x800080;
    /* source channels spread out as two 16-bit lanes per word */
    const uint32_t src_rb = src & 0x00FF00FF;
    const uint32_t src_ag = (src >> 8) & 0x00FF00FF;
    /* the source colour already packed as r5g6b5 */
    const uint16_t src_565 = ((src >> 3) & 0x001F) | ((src >> 5) & 0x07E0) | ((src >> 8) & 0xF800);
    int row, col;

    for (row = 0; row < h; row++)
    {
        for (col = 0; col < w; col++, dst++)
        {
            const uint32_t m = *mask++;
            uint32_t s_rb, s_ag, d_rb, d_g, pix, inv_alpha;

            /* fully transparent mask: destination stays as it is */
            if (m == 0)
                continue;

            if (opaquesrc && m == 0xFF)
            {
                *dst = src_565;
                continue;
            }

            /* src * mask / 255; each result ends up in the upper byte of its lane */
            s_rb = src_rb * m + round_bias;
            s_ag = src_ag * m + round_bias;
            s_rb += (s_rb & 0xFF00FFFF) >> 8;
            s_ag += (s_ag & 0xFF00FFFF) >> 8;

            /* 255 - (masked source alpha) */
            inv_alpha = ~s_ag >> 24;

            /* expand the 565 destination to 8 bits per channel */
            pix = *dst;
            d_rb = ((pix << 3) & 0xF8) | ((pix << 8) & 0xF80000);
            d_rb |= (d_rb >> 5) & 0x70007;
            d_g = (pix >> 3) & 0xFC;
            d_g |= (d_g >> 6) & 0x3;

            /* dst * (255 - alpha) / 255, same lane layout as above */
            d_rb = d_rb * inv_alpha + round_bias;
            d_g = d_g * inv_alpha + round_bias;
            d_rb += (d_rb & 0xFF00FFFF) >> 8;
            d_g += (d_g & 0xFF00FFFF) >> 8;

            s_rb = uqadd8(s_rb, d_rb);
            s_ag = uqadd8(s_ag, d_g);

            /* repack the upper byte of each lane as r5g6b5 */
            *dst = ((s_rb >> (3 + 8)) & 0x001F)
                 | ((s_rb >> (8 + 8)) & 0xF800)
                 | ((s_ag >> 5) & 0x07E0);
        }
        dst += dst_stride_delta;
        mask += mask_stride_delta;
    }
}

/*
 * Front end for the mixed C/uqadd8 solid-mask compositing code.  It checks
 * once whether the source colour is fully opaque (top byte 0xFF) and calls
 * the helper with a literal flag, so each inlined copy is specialised.
 */
static inline void fbCompositeSolidMask_nx8x0565_internal_mixed_armv6_c(
    uint16_t *dst, uint8_t *mask, uint32_t src, int w,
    int dst_stride_delta, int mask_stride_delta, int h)
{
    const uint32_t alpha = src >> 24;

    if (alpha != 0xFF)
    {
        fbCompositeSolidMask_nx8x0565_internal_mixed_armv6_c_helper(
            dst, mask, src, w, dst_stride_delta, mask_stride_delta, h, 0);
        return;
    }

    fbCompositeSolidMask_nx8x0565_internal_mixed_armv6_c_helper(
        dst, mask, src, w, dst_stride_delta, mask_stride_delta, h, 1);
}

#if defined(__ARM_EABI__) && defined(__linux__)
/*
 * ARMv6 assembly optimized version of 'fbCompositeSolidMask_nx8x0565'. It is
 * a bare metal 'naked' function which uses all the available CPU registers and
 * is compatible with ARM EABI. It might (or might not) break when used with a
 * different ABI, anyway better be safe than sorry.
 *
 * Arguments as the code consumes them: r0 = dst, r1 = mask, r2 = src, r3 = w;
 * dst_stride_delta, mask_stride_delta and h are read from the caller's stack
 * area, in that order. The two deltas are added to the pointers after a full
 * row has been walked, i.e. they are 'stride - width' (dst delta in pixels).
 *
 * Stack frame while the pixel loop runs (offsets from sp):
 *   +0 .. +12   r4-r7 reload values: src & 0xFF00FF, (src >> 8) & 0xFF00FF,
 *               0xF8, 0xFC
 *   +16         w (reloaded at the start of every row)
 *   +20         source colour converted to r5g6b5 (written by the opaque
 *               variant only)
 *   +24 .. +60  the ten saved registers
 *   +64         dst_stride_delta
 *   +68         mask_stride_delta
 *   +72         h, decremented in place once per row
 *
 * NOTE(review): the assembler macro defined here is never removed with
 * '.purgem', unlike the macros of the x8r8g8b8 -> r5g6b5 routine further
 * down in this file - confirm that is intentional.
 */
static void __attribute__((naked)) fbCompositeSolidMask_nx8x0565_internal_armv6(
    uint16_t *dst, uint8_t *mask, uint32_t src, int w,
    int dst_stride_delta, int mask_stride_delta, int h)
{
    asm volatile(
        /* the routine is a macro so that it can be expanded twice below:
         * without (0) and with (1) the opaque-source fast path */
        ".macro fbCompositeSolidMask_nx8x0565_internal_armv6_asm opaque_flag\n"
            /* save all registers (10 words) to stack */
            "stmdb   sp!, {r4-r11, ip, lr}\n" 
            /* some register aliases for better readability */
            "DST     .req  r0\n"
            "MASK    .req  r1\n"
            "S       .req  r2\n"
            "W       .req  r3\n"
            "A       .req  r8\n"
            "D       .req  r10\n"
            "C0000FF .req  r11\n"
            "C00001F .req  r9\n"
            "C800080 .req  ip\n"
            "CE000E0 .req  lr\n"
            /* precalculate some stuff and put it on stack */
            "mov     r6, #0xF8\n"
            "mov     r7, #0xFC\n"

            /* reserve two words: [sp] = w, [sp, #4] = slot for the r5g6b5 source */
            "str     W, [sp, #-8]!\n"

            ".if \\opaque_flag\n"
                /* ((src >> 3) & 0x001F) | ((src >> 5) & 0x07E0) | ((src >> 8) & 0xF800) */
                /* precalculate and save it to stack for later use */
                "mov     A, #0x1F\n"
                "and     D, A, S, lsr #3\n"
                "and     r4, S, #0xF80000\n"
                "and     r5, S, #0xFC00\n"
                "orr     D, r4, lsr #8\n"
                "orr     D, r5, lsr #5\n"
                "str     D, [sp, #4]\n"
            ".endif\n"

            /* offset = 8 (w + colour slot) + 10 saved registers + 8 (two stack args) */
            "ldr     D, [sp, #(8 + 10*4 + 8)]\n" /* h */
            "ldr     A, =0xFF00FF\n"
            "ldr     C800080, =0x800080\n"
            "ldr     CE000E0, =0xE000E0\n"
            "ldr     C0000FF, =0xFF\n"
            "ldr     C00001F, =0x1F\n"
            "and     r4, A, S\n"           /* r4 = src & 0x00FF00FF */
            "and     r5, A, S, lsr #8\n"   /* r5 = (src >> 8) & 0x00FF00FF */
            /* r4-r7 are overwritten for every blended pixel, keep pristine copies */
            "stmdb   sp!, {r4, r5, r6, r7}\n"
        "0:\n"
            /* leave immediately when h <= 0; the decremented value in D is
             * not kept, the row counter proper lives on the stack (label 5) */
            "subs    D, D, #1\n"
            "blt     6f\n"
        "1:\n"
            /* one pixel consumed; row finished once W drops below zero */
            "subs    W, W, #1\n"
            "blt     5f\n"
        "2:\n"
            "ldrb    A, [MASK], #1\n"
            "ldmia   sp, {r4, r5, r6, r7}\n" /* load a set of constants from stack */
            /* DST is advanced up front, the pixel is addressed as [DST, #-2] below */
            "add     DST, DST, #2\n"
            /* mask == 0: pixel stays untouched */
            "cmp     A, #0\n"
            "beq     1b\n"

            ".if \\opaque_flag\n"
                /* mask == 0xFF with an opaque source: store the converted colour */
                "cmp     A, #0xFF\n"
                "bne     3f\n"
                "ldr     D, [sp, #(4*4 + 4)]\n" /* load precalculated value */
                "subs    W, #1\n"
                "strh    D, [DST, #-2]\n"
                "bge     2b\n"
                /* NOTE(review): at the end of a row 'bge' is not taken and
                 * execution continues into label 3, blending this same pixel
                 * once more before reaching label 5 (W is reloaded there).
                 * Looks redundant rather than harmful - confirm. */
            ".endif\n"

        "3:\n"
            /* general case, same arithmetic as the C helper:
             * r4/r5 = src channel pairs * mask + 0x800080 */
            "ldrh    D, [DST, #-2]\n"
            "mla     r4, A, r4, C800080\n"
            "mla     r5, A, r5, C800080\n"
            "and     r6, r6, D, lsl #3\n" /* & 0xF8 */
            "and     r7, r7, D, lsr #3\n" /* & 0xFC */
            "and     D, D, #0xF800\n"
            /* r4 += (r4 & 0xFF00FFFF) >> 8, likewise r5; S and A are scratch here */
            "bic     S, r4, #0xFF0000\n"
            "bic     A, r5, #0xFF0000\n"
            "add     r4, r4, S, lsr #8\n"
            "add     r5, r5, A, lsr #8\n"

            /* widen the 565 destination channels to 8 bits by replicating
             * their top bits, and compute A = 0xFF ^ (r5 >> 24) */
            "and     S, r7, #0xC0\n"
            "orr     r6, r6, D, lsl #8\n"
            "and     D, r6, CE000E0\n"
            "eor     A, C0000FF, r5, lsr #24\n"
            "orr     r6, D, lsr #5\n"
            "orr     r7, S, lsr #6\n"

            /* r6/r7 = dst channels * A + 0x800080, then the same >> 8 fold-in */
            "mla     r6, A, r6, C800080\n"
            "mla     r7, A, r7, C800080\n"
            /* sets the flags for the 'bge' at the end; nothing in between changes them */
            "subs    W, #1\n"
            "bic     D, r6, #0xFF0000\n"
            "bic     A, r7, #0xFF0000\n"
            "add     r6, r6, D, lsr #8\n"
            "uqadd8  r4, r4, r6\n"
            "add     r7, r7, A, lsr #8\n"
            "uqadd8  r5, r5, r7\n"
            /* repack as r5g6b5 */
            "and     D, C00001F, r4, lsr #11\n"
            "and     r4, r4, #0xF8000000\n"
            "and     r5, r5, #0xFC00\n"
            "orr     D, r4, lsr #16\n"
            "orr     D, r5, lsr #5\n"
            "strh    D, [DST, #-2]\n"
            "bge     2b\n"
        "5:\n"
            /* end of row: offsets = 16 (r4-r7 copies) + 8 + 10 saved registers + argument */
            "ldr     r6, [sp, #(4*4 + 8 + 10*4 + 8)]\n" /* h */
            "ldr     r4, [sp, #(4*4 + 8 + 10*4 + 4)]\n" /* mask stride */
            "ldr     r5, [sp, #(4*4 + 8 + 10*4 + 0)]\n" /* dst stride */
            "ldr     W, [sp, #(4*4)]\n"
            "subs    r6, r6, #1\n" /* h */
            "str     r6, [sp, #(4*4 + 8 + 10*4 + 8)]\n" /* h */
            "add     MASK, MASK, r4\n"
            /* dst delta is counted in 16-bit pixels */
            "add     DST, DST, r5, lsl #1\n"
            "bgt     1b\n"
        "6:\n"
            /* drop the r4-r7 copies and the two extra words */
            "add     sp, sp, #(4*4 + 8)\n"
            "ldmia   sp!, {r4-r11, ip, pc}\n" /* restore all registers and return */
            ".unreq DST\n"
            ".unreq MASK\n"
            ".unreq S\n"
            ".unreq W\n"
            ".unreq A\n"
            ".unreq D\n"
            ".unreq C0000FF\n"
            ".unreq C00001F\n"
            ".unreq C800080\n"
            ".unreq CE000E0\n"
        ".endm\n"

        /* pick the variant: alpha (src >> 24) == 0xFF selects the opaque one.
         * Each expansion ends by loading pc, so the first one never runs
         * into the second. */
        "mov     ip, r2, lsr #24\n"
        "cmp     ip, #0xFF\n"
        "beq     9f\n"
        "fbCompositeSolidMask_nx8x0565_internal_armv6_asm 0\n"
    "9:\n"
        "fbCompositeSolidMask_nx8x0565_internal_armv6_asm 1\n"
        /* literal pool for the 'ldr reg, =const' loads above */
        ".ltorg\n"
    );
}

#else

/* The hand-written assembly routine is not built in this configuration:
 * route the same name to the mixed C/uqadd8 implementation instead. */
#define fbCompositeSolidMask_nx8x0565_internal_armv6(dst, mask, src, w, dst_delta, mask_delta, h) \
    fbCompositeSolidMask_nx8x0565_internal_mixed_armv6_c((dst), (mask), (src), (w), (dst_delta), (mask_delta), (h))

#endif

/**
 * Reference C version of the optimized 'fbCompositeSrcAdd_8000x8000'
 * operation: every destination byte is replaced by the saturated sum of
 * itself and the matching source byte.  Assembly use is limited to the
 * 'uqadd8' instruction.
 *
 * Rows are handled two at a time, which suits the common case of font
 * glyphs where the width is small (typically only 6-13 pixels); an odd
 * final row is handled on its own afterwards.
 *
 * 'w' is the row width in bytes, 'h' the number of rows, and the strides
 * are the byte distances between the starts of consecutive rows.
 *
 * Within a row, 'dst' is first brought to a 4 byte boundary (only when at
 * least 3 bytes are available), then whole words are processed, then the
 * 2 byte and 1 byte leftovers.  Only the alignment of 'dst' for the
 * current row is checked; the other pointers are accessed as they come.
 */
static inline void fbCompositeSrcAdd_8000x8000_internal_mixed_armv6_c(
    uint8_t *dst, uint8_t *src, int w, int dst_stride,
    int src_stride, int h)
{
    const int row_width = w;
    /* 'here' = row at dst/src, 'below' = the row one stride further on */
    uint32_t s_below, s_here, d_below, d_here;

    /* pairs of rows */
    for (h -= 2; h >= 0; h -= 2)
    {
        w = row_width;

        /* head: align dst */
        if (w >= 3)
        {
            if ((uintptr_t)dst & 1)
            {
                s_below = src[src_stride];
                s_here = src[0];
                d_below = dst[dst_stride];
                d_here = dst[0];
                /* the lower row is written first, then the current one */
                dst[dst_stride] = uqadd8(d_below, s_below);
                dst[0] = uqadd8(d_here, s_here);
                src++;
                dst++;
                w--;
            }
            if ((uintptr_t)dst & 2)
            {
                s_below = *(uint16_t *)(src + src_stride);
                s_here = *(uint16_t *)src;
                d_below = *(uint16_t *)(dst + dst_stride);
                d_here = *(uint16_t *)dst;
                *(uint16_t *)(dst + dst_stride) = uqadd8(d_below, s_below);
                *(uint16_t *)dst = uqadd8(d_here, s_here);
                src += 2;
                dst += 2;
                w -= 2;
            }
        }

        /* body: four bytes per step; w ends up negative, but its two low
         * bits still tell how many bytes are left */
        for (w -= 4; w >= 0; w -= 4)
        {
            s_below = *(uint32_t *)(src + src_stride);
            s_here = *(uint32_t *)src;
            d_below = *(uint32_t *)(dst + dst_stride);
            d_here = *(uint32_t *)dst;
            *(uint32_t *)(dst + dst_stride) = uqadd8(d_below, s_below);
            *(uint32_t *)dst = uqadd8(d_here, s_here);
            src += 4;
            dst += 4;
        }

        /* tail: 2 and/or 1 remaining bytes */
        if (w & 2)
        {
            s_below = *(uint16_t *)(src + src_stride);
            s_here = *(uint16_t *)src;
            d_below = *(uint16_t *)(dst + dst_stride);
            d_here = *(uint16_t *)dst;
            *(uint16_t *)(dst + dst_stride) = uqadd8(d_below, s_below);
            *(uint16_t *)dst = uqadd8(d_here, s_here);
            src += 2;
            dst += 2;
        }
        if (w & 1)
        {
            s_below = src[src_stride];
            s_here = src[0];
            d_below = dst[dst_stride];
            d_here = dst[0];
            dst[dst_stride] = uqadd8(d_below, s_below);
            dst[0] = uqadd8(d_here, s_here);
            src++;
            dst++;
        }

        /* skip over the row that was processed alongside this one */
        src += src_stride * 2 - row_width;
        dst += dst_stride * 2 - row_width;
    }

    /* h is -1 here exactly when one row is left over */
    if (h == -1)
    {
        w = row_width;

        if (w >= 3)
        {
            if ((uintptr_t)dst & 1)
            {
                s_here = src[0];
                d_here = dst[0];
                dst[0] = uqadd8(d_here, s_here);
                src++;
                dst++;
                w--;
            }
            if ((uintptr_t)dst & 2)
            {
                s_here = *(uint16_t *)src;
                d_here = *(uint16_t *)dst;
                *(uint16_t *)dst = uqadd8(d_here, s_here);
                src += 2;
                dst += 2;
                w -= 2;
            }
        }

        for (w -= 4; w >= 0; w -= 4)
        {
            s_here = *(uint32_t *)src;
            d_here = *(uint32_t *)dst;
            *(uint32_t *)dst = uqadd8(d_here, s_here);
            src += 4;
            dst += 4;
        }

        if (w & 2)
        {
            s_here = *(uint16_t *)src;
            d_here = *(uint16_t *)dst;
            *(uint16_t *)dst = uqadd8(d_here, s_here);
            src += 2;
            dst += 2;
        }
        if (w & 1)
        {
            s_here = src[0];
            d_here = dst[0];
            dst[0] = uqadd8(d_here, s_here);
            src++;
            dst++;
        }
    }
}

/**
 * Entry point for the 8-bit saturating add ('fbCompositeSrcAdd_8000x8000').
 * For now it simply forwards to the mixed C/uqadd8 implementation.
 *
 * TODO: implement a fully ARMv6 assembly optimized version of this function
 */
#define fbCompositeSrcAdd_8000x8000_internal_armv6(dst, src, w, dst_stride, src_stride, h) \
    fbCompositeSrcAdd_8000x8000_internal_mixed_armv6_c((dst), (src), (w), (dst_stride), (src_stride), (h))

/**
 * Conversion x8r8g8b8 -> r5g6b5, plain C version (no assembly).
 *
 * Copies a 'w' x 'h' pixel rectangle from 'src' to 'dst', keeping the top
 * 5/6/5 bits of the red/green/blue channels and dropping the unused top
 * byte.  'dst_stride' and 'src_stride' are counted in pixels of the
 * respective buffer (uint16_t and uint32_t elements).
 *
 * Nothing is read or written when 'w' or 'h' is zero or negative.
 */
static inline void fbComposite_x8r8g8b8_src_r5g6b5_internal_c(
    uint16_t *dst, uint32_t *src, int w, int dst_stride,
    int src_stride, int h)
{
    int x, y;

    /* A negative count would otherwise keep the countdown loops running
     * far outside both buffers. */
    if (w <= 0 || h <= 0)
        return;

    for (y = 0; y < h; y++)
    {
        for (x = 0; x < w; x++)
        {
            const uint32_t s = src[x];
            /* blue lands in bits 0-4, red in bits 16-20 ... */
            uint32_t rb = (s >> 3) & 0x1F001F;

            /* ... and this moves red down to bits 11-15 */
            rb |= rb >> 5;

            /* green's top six bits go to bits 5-10; the store keeps only
             * the low 16 bits */
            dst[x] = rb | ((s & 0xFC00) >> 5);
        }
        src += src_stride;
        dst += dst_stride;
    }
}

/**
 * Conversion x8r8g8b8 -> r5g6b5
 *
 * Mixed C / inline assembly version.  Per row: one pixel is converted in C
 * if needed to bring 'dst' to a 4 byte boundary, the assembly loop then
 * converts two pixels per iteration and stores them as one 32-bit word,
 * and a final odd pixel is converted in C again.
 *
 * 'w' and 'h' are the rectangle size in pixels; 'dst_stride' and
 * 'src_stride' are counted in pixels of the respective buffer.  Nothing is
 * read or written when 'w' or 'h' is zero or negative.
 *
 * TODO: optimize more, eliminate stalls, try to use burst writes (4 words aligned 
 * at 16 byte boundary)
 */
static inline void fbComposite_x8r8g8b8_src_r5g6b5_internal_mixed_armv6_c(
    uint16_t *dst, uint32_t *src, int w, int dst_stride,
    int src_stride, int h)
{
    uint32_t a, x, y, c1F001F = 0x1F001F;
    int backup_w = w;

    /* A negative odd width would make the 'w & 1' tail below write one
     * pixel, and a negative height would never terminate 'while (h--)'. */
    if (w <= 0 || h <= 0)
        return;

    while (h--)
    {
        w = backup_w;
        /* head: single pixel so that the word stores below are aligned */
        if (w > 0 && ((uintptr_t)dst & 2))
        {
            x = *src++;

            a = (x >> 3) & c1F001F;
            x &= 0xFC00;
            a |= a >> 5;
            a |= x >> 5;

            *dst++ = a;
            w--;
        }

        /* body: two pixels per iteration; w is left at -1 or -2 on exit */
        asm volatile(
            "subs  %[w], %[w], #2\n"
            "blt   2f\n"
        "1:\n"
            "ldr   %[x], [%[src]], #4\n"
            "ldr   %[y], [%[src]], #4\n"
            "subs  %[w], %[w], #2\n"
            
            "and   %[a], %[c1F001F], %[x], lsr #3\n"
            "and   %[x], %[x], #0xFC00\n\n"
            "orr   %[a], %[a], %[a], lsr #5\n"
            "orr   %[x], %[a], %[x], lsr #5\n"

            "and   %[a], %[c1F001F], %[y], lsr #3\n"
            "and   %[y], %[y], #0xFC00\n\n"
            "orr   %[a], %[a], %[a], lsr #5\n"
            "orr   %[y], %[a], %[y], lsr #5\n"

            /* first pixel in the low halfword, second in the high one */
            "pkhbt %[x], %[x], %[y], lsl #16\n"
            "str   %[x], [%[dst]], #4\n"
            "bge   1b\n"
        "2:\n"
        : [c1F001F] "+&r" (c1F001F), [src] "+&r" (src), [dst] "+&r" (dst), [a] "=&r" (a), 
          [x] "=&r" (x), [y] "=&r" (y), [w] "+&r" (w)
        : /* no input-only operands */
        /* 'subs' changes the condition flags, and the loop reads *src and
         * writes *dst behind the compiler's back */
        : "cc", "memory"
        );

        /* tail: w is -1 when exactly one pixel is left */
        if (w & 1)
        {
            x = *src++;

            a = (x >> 3) & c1F001F;
            x = x & 0xFC00;
            a |= a >> 5;
            a |= x >> 5;

            *dst++ = a;
        }

        src += src_stride - backup_w;
        dst += dst_stride - backup_w;
    }
}

#if defined(__ARMEL__) && defined(__ARM_EABI__) && defined(__linux__)

/**
 * Conversion x8r8g8b8 -> r5g6b5
 *
 * Naked, fully hand-written routine. Arguments as the code consumes them:
 * r0 = dst, r1 = src, r2 = w, r3 = dst_stride; src_stride and h are read
 * from the caller's stack area. Strides are counted in pixels (they are
 * scaled by 2 and 4 bytes when added to DST and SRC).
 *
 * Note: 'w' must be >= 7
 *       (the alignment steps at the start of every row may consume
 *       1 + 2 + 4 pixels and do not check W before doing so)
 *
 * The two stack argument slots are overwritten with (dst_stride - w) and
 * (src_stride - w), and the original 'w' is parked in the stack slot where
 * ip was saved, so that it can be reloaded at the end of each row.
 */
static void __attribute__((naked)) fbComposite_x8r8g8b8_src_r5g6b5_internal_armv6(
    uint16_t *dst, uint32_t *src, int w, int dst_stride,
    int src_stride, int h)
{
    asm volatile(
        /* define supplementary macros */
        /* convert the pixel in register PIX in place; uses A as scratch and
         * leaves the r5g6b5 value in the low 16 bits of PIX */
        ".macro cvt8888to565 PIX\n"
            "and   A, C1F001F, \\PIX, lsr #3\n"
            "and   \\PIX, \\PIX, #0xFC00\n\n"
            "orr   A, A, A, lsr #5\n"
            "orr   \\PIX, A, \\PIX, lsr #5\n"
        ".endm\n"

        /* PIX1 = low halfword of PIX1 | (PIX2 << 16): two pixels in one word */
        ".macro combine_pixels_pair PIX1, PIX2\n"
            "pkhbt \\PIX1, \\PIX1, \\PIX2, lsl #16\n" /* Note: assume little endian byte order */
        ".endm\n"

        /* function entry, save all registers (10 words) to stack */
        "stmdb   sp!, {r4-r11, ip, lr}\n"
        
        /* define some aliases */
        "DST     .req  r0\n"
        "SRC     .req  r1\n"
        "W       .req  r2\n"
        /* r3 still holds dst_stride on entry; it becomes H once h is loaded below */
        "H       .req  r3\n"

        "TMP1    .req  r4\n"
        "TMP2    .req  r5\n"
        "TMP3    .req  r6\n"
        "TMP4    .req  r7\n"
        "TMP5    .req  r8\n"
        "TMP6    .req  r9\n"
        "TMP7    .req  r10\n"
        "TMP8    .req  r11\n"

        "C1F001F .req  ip\n"
        "A       .req  lr\n"
        
        /* the stack arguments sit right above the 10 saved registers */
        "ldr     TMP1, [sp, #(10*4+0)]\n" /* load src_stride */
        "ldr     C1F001F, =0x1F001F\n"
        "sub     r3, r3, W\n"
        "str     r3, [sp, #(10*4+0)]\n" /* store (dst_stride-w) */
        "ldr     r3, [sp, #(10*4+4)]\n" /* load h */
        "sub     TMP1, TMP1, W\n"
        "str     TMP1, [sp, #(10*4+4)]\n" /* store (src_stride-w) */
        
        "str     W, [sp, #(8*4)]\n" /* saved ip = W */

    "0:\n"
        /* nothing to do when h <= 0 */
        "subs    H, H, #1\n"
        "blt     6f\n"
    "1:\n"
        /* align DST at 4 byte boundary */
        "tst     DST, #2\n"
        "beq     2f\n"
        "ldr     TMP1, [SRC], #4\n"
        "sub     W, W, #1\n"
        "cvt8888to565 TMP1\n"
        "strh    TMP1, [DST], #2\n"
    "2:"
        /* align DST at 8 byte boundary */
        "tst     DST, #4\n"
        "beq     2f\n"
        "ldmia   SRC!, {TMP1, TMP2}\n"
        "sub     W, W, #2\n"
        "cvt8888to565 TMP1\n"
        "cvt8888to565 TMP2\n"
        "combine_pixels_pair TMP1, TMP2\n"
        "str     TMP1, [DST], #4\n"
    "2:"
        /* align DST at 16 byte boundary */
        "tst     DST, #8\n"
        "beq     2f\n"
        "ldmia   SRC!, {TMP1, TMP2, TMP3, TMP4}\n"
        "sub     W, W, #4\n"
        "cvt8888to565 TMP1\n"
        "cvt8888to565 TMP2\n"
        "cvt8888to565 TMP3\n"
        "cvt8888to565 TMP4\n"
        "combine_pixels_pair TMP1, TMP2\n"
        "combine_pixels_pair TMP3, TMP4\n"
        "stmia DST!, {TMP1, TMP3}\n"
    "2:"
        /* inner loop, process 8 pixels per iteration */
        "subs    W, W, #8\n"
        "blt     4f\n"
    "3:\n"
        "ldmia   SRC!, {TMP1, TMP2, TMP3, TMP4, TMP5, TMP6, TMP7, TMP8}\n"
        /* sets the flags for the 'bge' below; the conversion macros leave them alone */
        "subs    W, W, #8\n"
        "cvt8888to565 TMP1\n"
        "cvt8888to565 TMP2\n"
        "cvt8888to565 TMP3\n"
        "cvt8888to565 TMP4\n"
        "cvt8888to565 TMP5\n"
        "cvt8888to565 TMP6\n"
        "cvt8888to565 TMP7\n"
        "cvt8888to565 TMP8\n"
        "combine_pixels_pair TMP1, TMP2\n"
        "combine_pixels_pair TMP3, TMP4\n"
        "combine_pixels_pair TMP5, TMP6\n"
        "combine_pixels_pair TMP7, TMP8\n"
        "stmia   DST!, {TMP1, TMP3, TMP5, TMP7}\n"
        "bge     3b\n"
    "4:\n"

        /* process the remaining pixels */
        /* W is negative here, but its low three bits still give the
         * number of pixels left (0-7) */
        "tst     W, #4\n"
        "beq     4f\n"
        "ldmia   SRC!, {TMP1, TMP2, TMP3, TMP4}\n"
        "cvt8888to565 TMP1\n"
        "cvt8888to565 TMP2\n"
        "cvt8888to565 TMP3\n"
        "cvt8888to565 TMP4\n"
        "combine_pixels_pair TMP1, TMP2\n"
        "combine_pixels_pair TMP3, TMP4\n"
        "stmia   DST!, {TMP1, TMP3}\n"
    "4:\n"
        "tst     W, #2\n"
        "beq     4f\n"
        "ldmia   SRC!, {TMP1, TMP2}\n"
        "cvt8888to565 TMP1\n"
        "cvt8888to565 TMP2\n"
        "combine_pixels_pair TMP1, TMP2\n"
        "str     TMP1, [DST], #4\n"
    "4:\n"
        "tst     W, #1\n"
        "beq     4f\n"
        "ldr     TMP1, [SRC], #4\n"
        "cvt8888to565 TMP1\n"
        "strh    TMP1, [DST], #2\n"
    "4:\n"
        /* end of row: reload w and step both pointers to the next row */
        "ldr     TMP1, [sp, #(10*4+0)]\n" /* (dst_stride-w) */
        "ldr     TMP2, [sp, #(10*4+4)]\n" /* (src_stride-w) */
        "ldr     W, [sp, #(8*4)]\n"
        "subs    H, H, #1\n"
        "add     DST, DST, TMP1, lsl #1\n"
        "add     SRC, SRC, TMP2, lsl #2\n"
        "bge     1b\n"
    "6:\n"
        "ldmia   sp!, {r4-r11, ip, pc}\n" /* restore all registers and return */
        /* literal pool for the 'ldr C1F001F, =0x1F001F' load */
        ".ltorg\n"

        ".unreq   DST\n"
        ".unreq   SRC\n"
        ".unreq   W\n"
        ".unreq   H\n"

        ".unreq   TMP1\n"
        ".unreq   TMP2\n"
        ".unreq   TMP3\n"
        ".unreq   TMP4\n"
        ".unreq   TMP5\n"
        ".unreq   TMP6\n"
        ".unreq   TMP7\n"
        ".unreq   TMP8\n"

        ".unreq   C1F001F\n"
        ".unreq   A\n"

        /* remove the helper macros again so that the names stay local to this routine */
        ".purgem  cvt8888to565\n"
        ".purgem  combine_pixels_pair\n"
    );
}

#else

/* The hand-written assembly routine is not built in this configuration:
 * route the same name to the mixed C/inline-assembly implementation.
 * NOTE(review): that fallback packs pixel pairs with the same pkhbt + str
 * sequence, so it looks little-endian-only as well - confirm before relying
 * on it for a big-endian build. */
#define fbComposite_x8r8g8b8_src_r5g6b5_internal_armv6(dst, src, w, dst_stride, src_stride, h) \
    fbComposite_x8r8g8b8_src_r5g6b5_internal_mixed_armv6_c((dst), (src), (w), (dst_stride), (src_stride), (h))

#endif

#endif
