/*
 * Copyright © 2007 Siarhei Siamashka
 * Copyright © 2008 Nokia Corporation
 *
 * Permission to use, copy, modify, distribute and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the names of the authors and/or copyright holders
 * not be used in advertising or publicity pertaining to distribution of the
 * software without specific, written prior permission.  The authors and
 * copyright holders make no representations about the suitability of this
 * software for any purpose.  It is provided "as is" without any express
 * or implied warranty.
 *
 * THE AUTHORS AND COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO
 * THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR
 * ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER
 * RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF
 * CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Author: Siarhei Siamashka <siarhei.siamashka@nokia.com>
 */

/*
 * ARMv6 assembly optimized color format conversion functions (planar YV12 to
 * some custom YUV420 format used by graphics chip in Nokia N800/N810)
 */
        .text

/*
 * YUV420_function_template emits one global function that converts a line
 * of pixels.
 *
 * Template arguments:
 *   function_name - name of the global symbol that gets defined
 *   USE_PLD       - non-zero: cache preload (pld) instructions are emitted
 *                   in the function body
 *   USE_ARMV6     - non-zero: the CONVERT_8_PIXELS_MACROBLOCK_* helpers use
 *                   the ldrh/rev16 based code, otherwise a variant that is
 *                   built from byte loads only
 */
.macro YUV420_function_template function_name, USE_PLD, USE_ARMV6

        .align
        .global \function_name
        .func \function_name
\function_name:

/*
 * Register roles inside the function. These are C preprocessor aliases, so
 * they are substituted textually in every line up to the matching #undef.
 * NOTE(review): r0-r3 presumably carry the first four C arguments of the
 * function (destination, two source pointers, width) - confirm against the
 * C prototype used by the callers.
 */
#define DST     r0      /* output pointer, post-incremented by the stores */
#define SRC_Y   r1      /* input pointer, post-incremented by the loads */
#define SRC_U   r2      /* second input pointer, post-incremented by the loads */
#define WIDTH   r3      /* number of pixels that are still to be converted */
#define TMP1    r10     /* scratch */
#define TMP2    r11     /* scratch */
#define TMP3    lr      /* scratch; the return address is kept on the stack */
/*
 * CONVERT_4_PIXELS_MACROBLOCK
 *
 * Read information about 4 pixels, convert them to YUV420 and store into
 * 6 bytes using 16-bit writes.
 *
 * Four bytes (Y0..Y3) are read from SRC_Y and two bytes (U0, U1) from SRC_U.
 * The three halfwords stored to DST are:
 *
 *      Y0 | U0 << 8,    U1 | Y1 << 8,    Y3 | Y2 << 8
 *
 * SRC_Y is advanced by 4, SRC_U by 2 and DST by 6 bytes.
 * Clobbers r4, r5, r6, TMP1, TMP2 and TMP3; condition flags are not changed.
 */
.macro  CONVERT_4_PIXELS_MACROBLOCK
        ldrb    r4, [SRC_Y], #1         /* r4   = Y0 */
        ldrb    TMP1, [SRC_U], #1       /* TMP1 = U0 */
        ldrb    r5, [SRC_U], #1         /* r5   = U1 */
        ldrb    TMP2, [SRC_Y], #1       /* TMP2 = Y1 */
        ldrb    r6, [SRC_Y, #1]         /* r6   = Y3, pointer is not advanced */
        ldrb    TMP3, [SRC_Y], #2       /* TMP3 = Y2, then step over Y2 and Y3 */
        add     r4, r4, TMP1, lsl #8    /* r4 = Y0 | U0 << 8 */
        add     r5, r5, TMP2, lsl #8    /* r5 = U1 | Y1 << 8 */
        add     r6, r6, TMP3, lsl #8    /* r6 = Y3 | Y2 << 8 */
        strh    r4, [DST], #2
        strh    r5, [DST], #2
        strh    r6, [DST], #2
.endm

/*
 * CONVERT_8_PIXELS_MACROBLOCK_*  macros are the "building bricks" from which
 * the main loop gets constructed.
 *
 * CONVERT_8_PIXELS_MACROBLOCK_1 prepares the first 32-bit output value
 * for 8 pixels macroblock (stored to the register that is specified as the
 * first macro argument)
 *
 * CONVERT_8_PIXELS_MACROBLOCK_2 prepares the second 32-bit output value
 * for 8 pixels macroblock (stored to the register that is specified as the
 * first macro argument)
 *
 * CONVERT_8_PIXELS_MACROBLOCK_3 prepares the third 32-bit output value
 * for 8 pixels macroblock (stored to the register that is specified as the
 * first macro argument)
 *
 * After using all three macros sequentially, we get the converted pixel data
 * for 8 pixels in three 32-bit registers. The only thing left to do is
 * just store them to the destination buffer (preferably using store-multiple
 * instruction for better performance). Input data is read from SRC_Y and SRC_U
 * pointers and they are advanced forward as needed.
 */

.if \USE_ARMV6

/*
 * Here is the explanation for the extra arguments (the first argument is
 * the destination register for converted data) of CONVERT_8_PIXELS_MACROBLOCK_*
 * macros.
 *
 * PLD_FLAG selects if we want to insert cache preload instruction in the
 * code flow of the macro
 *
 * FLAG1 and FLAG2 arguments are used to be able to "fuse" several macros
 * together when they are used back-to-back and reorder instructions
 * to avoid ARM11 pipeline stalls. When FLAG2 is set to 1, we need to
 * have another CONVERT_8_PIXELS_MACROBLOCK_* macro following with
 * FLAG1 set to 1 (and DST_REG2 from the first macro should be the same
 * as DST_REG1 from the second macro). In a long sequence of
 * CONVERT_8_PIXELS_MACROBLOCK_*, the first macro should always
 * have FLAG1 set to 0 and the last one should have FLAG2 set to 0.
 *
 * Example: the following two code fragments do the same job, but the
 * second one is faster because of "fusing":
 *
 * 1)
 * CONVERT_8_PIXELS_MACROBLOCK_1 r4, r6, 0, 0, 0
 * CONVERT_8_PIXELS_MACROBLOCK_2 r6, r7, 0, 0, 0
 *
 * 2)
 * CONVERT_8_PIXELS_MACROBLOCK_1 r4, r6, 0, 1, 0
 * CONVERT_8_PIXELS_MACROBLOCK_2 r6, r7, 1, 0, 0
 *
 *
 * Also see the non-ARMv6 implementation below for better understanding, as
 * it does the same job but is much simpler ("fusing" is not required)
 */

/*
 * CONVERT_8_PIXELS_MACROBLOCK_1 (ARMv6 variant): first output word of an
 * 8 pixels macroblock.
 *
 *   DST_REG1 - receives the finished word:
 *              Y0 | U0 << 8 | U1 << 16 | Y1 << 24
 *   DST_REG2 - only used when FLAG2 == 1: receives the halfword that the
 *              following CONVERT_8_PIXELS_MACROBLOCK_2 expects to find in
 *              its DST_REG1
 *   FLAG1    - 0: this macro loads its own input (2 bytes from SRC_U and
 *              one halfword from SRC_Y); 1: the input is already in
 *              DST_REG1, TMP1 and TMP2 (loaded by the preceding macro)
 *   FLAG2    - 1: also load the input of the following
 *              CONVERT_8_PIXELS_MACROBLOCK_2 into DST_REG2, TMP1 and TMP2
 *   PLD_FLAG - 1: request a cache preload 48 bytes ahead of SRC_Y. The
 *              preload is emitted only when the template was instantiated
 *              with USE_PLD enabled, the same rule that the non-ARMv6
 *              variant and the function body follow.
 *
 * The halfword loads are assumed to be little-endian (first byte in bits
 * 0-7). Condition flags are not changed.
 */
.macro  CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG1, DST_REG2, FLAG1, FLAG2, PLD_FLAG
.if \FLAG1 == 0
        ldrb    \DST_REG1, [SRC_U], #1          /* U0 */
        ldrh    TMP1, [SRC_Y], #2               /* Y0 | Y1 << 8 */
        ldrb    TMP2, [SRC_U], #1               /* U1 */
.endif
.if \FLAG2 == 1
        ldrh    \DST_REG2, [SRC_Y], #2          /* next macro: Y2 | Y3 << 8 */
.endif
.if \USE_PLD && (\PLD_FLAG == 1)
        pld     [SRC_Y, #48]
.endif
        add     \DST_REG1, \DST_REG1, TMP1, lsl #8
        add     \DST_REG1, \DST_REG1, TMP2, lsl #24     /* U0 Y0 Y1 U1 */
.if \FLAG2 == 1
        /* TMP1 and TMP2 are free again, reuse them for the next macro */
        ldrb    TMP1, [SRC_U], #1               /* next macro: U2 */
        ldrb    TMP2, [SRC_Y], #1               /* next macro: Y4 */
.endif
        rev16   \DST_REG1, \DST_REG1            /* swap bytes in each halfword */
.endm

/*
 * CONVERT_8_PIXELS_MACROBLOCK_2 (ARMv6 variant): second output word of an
 * 8 pixels macroblock.
 *
 *   DST_REG1 - receives the finished word:
 *              Y3 | Y2 << 8 | Y4 << 16 | U2 << 24
 *   DST_REG2 - only used when FLAG2 == 1: receives the byte that the
 *              following CONVERT_8_PIXELS_MACROBLOCK_3 expects to find in
 *              its DST_REG1
 *   FLAG1    - 0: this macro loads its own input; 1: the input is already
 *              in DST_REG1, TMP1 and TMP2 (loaded by the preceding macro)
 *   FLAG2    - 1: also load the input of the following
 *              CONVERT_8_PIXELS_MACROBLOCK_3 into DST_REG2, TMP1 and TMP2
 *   DUMMY1   - not used
 *
 * The halfword loads are assumed to be little-endian (first byte in bits
 * 0-7). Condition flags are not changed.
 */
.macro  CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
.if \FLAG1 == 0
        ldrh    \DST_REG1, [SRC_Y], #2          /* Y2 | Y3 << 8 */
        ldrb    TMP1, [SRC_U], #1               /* U2 */
        ldrb    TMP2, [SRC_Y], #1               /* Y4 */
.endif
.if \FLAG2 == 1
        ldrb    \DST_REG2, [SRC_Y], #1          /* next macro: Y5 */
.endif
        add     \DST_REG1, \DST_REG1, TMP1, lsl #16
        add     \DST_REG1, \DST_REG1, TMP2, lsl #24     /* Y2 Y3 U2 Y4 */
.if \FLAG2 == 1
        /* TMP1 and TMP2 are free again, reuse them for the next macro */
        ldrb    TMP1, [SRC_U], #1               /* next macro: U3 */
        ldrh    TMP2, [SRC_Y], #2               /* next macro: Y6 | Y7 << 8 */
.endif
        rev16   \DST_REG1, \DST_REG1            /* swap bytes in each halfword */
.endm

/*
 * CONVERT_8_PIXELS_MACROBLOCK_3 (ARMv6 variant): third output word of an
 * 8 pixels macroblock.
 *
 *   DST_REG1 - receives the finished word:
 *              U3 | Y5 << 8 | Y7 << 16 | Y6 << 24
 *   DST_REG2 - only used when FLAG2 == 1: receives the byte that the
 *              following CONVERT_8_PIXELS_MACROBLOCK_1 expects to find in
 *              its DST_REG1
 *   FLAG1    - 0: this macro loads its own input; 1: the input is already
 *              in DST_REG1, TMP1 and TMP2 (loaded by the preceding macro)
 *   FLAG2    - 1: also load the input of the following
 *              CONVERT_8_PIXELS_MACROBLOCK_1 (the start of the next
 *              8 pixels) into DST_REG2, TMP1 and TMP2
 *   DUMMY1   - not used
 *
 * The halfword loads are assumed to be little-endian (first byte in bits
 * 0-7). Condition flags are not changed.
 */
.macro  CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG1, DST_REG2, FLAG1, FLAG2, DUMMY1
.if \FLAG1 == 0
        ldrb    \DST_REG1, [SRC_Y], #1          /* Y5 */
        ldrb    TMP1, [SRC_U], #1               /* U3 */
        ldrh    TMP2, [SRC_Y], #2               /* Y6 | Y7 << 8 */
.endif
.if \FLAG2 == 1
        ldrb    \DST_REG2, [SRC_U], #1          /* next macro: U0 */
.endif
        add     \DST_REG1, \DST_REG1, TMP1, lsl #8
        add     \DST_REG1, \DST_REG1, TMP2, lsl #16     /* Y5 U3 Y6 Y7 */
.if \FLAG2 == 1
        /* TMP1 and TMP2 are free again, reuse them for the next macro */
        ldrh    TMP1, [SRC_Y], #2               /* next macro: Y0 | Y1 << 8 */
        ldrb    TMP2, [SRC_U], #1               /* next macro: U1 */
.endif
        rev16   \DST_REG1, \DST_REG1            /* swap bytes in each halfword */
.endm

.else

/*
 * ARMv4-compatible implementation of CONVERT_8_PIXELS_MACROBLOCK_* macros.
 * Extra arguments are not used and ignored (except for PLD_FLAG which enables
 * prefetch and may be useful for some ARMv5TE compatible cores).
 */

/*
 * CONVERT_8_PIXELS_MACROBLOCK_1 (byte load variant): first output word of
 * an 8 pixels macroblock.
 *
 *   DST_REG  - receives Y0 | U0 << 8 | U1 << 16 | Y1 << 24
 *   DUMMY1-3 - not used
 *   PLD_FLAG - 1: emit a cache preload 48 bytes ahead of SRC_Y, but only
 *              when the template was instantiated with USE_PLD enabled
 *
 * Advances SRC_Y by 2 and SRC_U by 2. Clobbers TMP1, TMP2 and TMP3.
 */
.macro  CONVERT_8_PIXELS_MACROBLOCK_1 DST_REG, DUMMY1, DUMMY2, DUMMY3, PLD_FLAG
        ldrb    \DST_REG, [SRC_Y], #1           /* Y0 */
        ldrb    TMP1, [SRC_U], #1               /* U0 */
        ldrb    TMP2, [SRC_U], #1               /* U1 */
        ldrb    TMP3, [SRC_Y], #1               /* Y1 */
.if \USE_PLD && (\PLD_FLAG == 1)
        pld     [SRC_Y, #48]
.endif
        add     \DST_REG, \DST_REG, TMP1, lsl #8
        add     \DST_REG, \DST_REG, TMP2, lsl #16
        add     \DST_REG, \DST_REG, TMP3, lsl #24
.endm

/*
 * CONVERT_8_PIXELS_MACROBLOCK_2 (byte load variant): second output word of
 * an 8 pixels macroblock.
 *
 *   DST_REG  - receives Y3 | Y2 << 8 | Y4 << 16 | U2 << 24
 *   DUMMY1-4 - not used
 *
 * Advances SRC_Y by 3 and SRC_U by 1. Clobbers TMP1, TMP2 and TMP3.
 */
.macro  CONVERT_8_PIXELS_MACROBLOCK_2 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
        ldrb    \DST_REG, [SRC_Y, #1]           /* Y3, pointer is not advanced */
        ldrb    TMP1, [SRC_Y], #2               /* Y2, then step over Y2 and Y3 */
        ldrb    TMP2, [SRC_Y], #1               /* Y4 */
        ldrb    TMP3, [SRC_U], #1               /* U2 */
        add     \DST_REG, \DST_REG, TMP1, lsl #8
        add     \DST_REG, \DST_REG, TMP2, lsl #16
        add     \DST_REG, \DST_REG, TMP3, lsl #24
.endm

/*
 * CONVERT_8_PIXELS_MACROBLOCK_3 (byte load variant): third output word of
 * an 8 pixels macroblock.
 *
 *   DST_REG  - receives U3 | Y5 << 8 | Y7 << 16 | Y6 << 24
 *   DUMMY1-4 - not used
 *
 * Advances SRC_Y by 3 and SRC_U by 1. Clobbers TMP1, TMP2 and TMP3.
 */
.macro  CONVERT_8_PIXELS_MACROBLOCK_3 DST_REG, DUMMY1, DUMMY2, DUMMY3, DUMMY4
        ldrb    \DST_REG, [SRC_U], #1           /* U3 */
        ldrb    TMP1, [SRC_Y], #1               /* Y5 */
        ldrb    TMP2, [SRC_Y, #1]               /* Y7, pointer is not advanced */
        ldrb    TMP3, [SRC_Y], #2               /* Y6, then step over Y6 and Y7 */
        add     \DST_REG, \DST_REG, TMP1, lsl #8
        add     \DST_REG, \DST_REG, TMP2, lsl #16
        add     \DST_REG, \DST_REG, TMP3, lsl #24
.endm

.endif

/*
 * All the supplementary macros are now defined. Here is the actual function
 * entry point.
 *
 * Outline:
 *   - force DST to 16-bit alignment and round WIDTH down to a multiple of 4
 *   - if DST is not 32-bit aligned, convert one 4 pixels block so that the
 *     word stores below are aligned
 *   - convert 32 pixels per iteration (three 4-word stores)
 *   - convert the remainder 8 pixels per iteration (one 3-word store)
 *   - convert one last 4 pixels block if it is still pending
 *
 * Inside the loops WIDTH holds "pixels left minus the loop step", so that
 * a plain subs/bge pair drives the loop. None of the conversion macros,
 * stmia or pld change the condition flags, which is why "subs" may be
 * placed well ahead of the branch that consumes its result.
 *
 * r4-r8, r10, r11 and lr are saved on entry and restored on return (lr is
 * popped directly into pc).
 */

.if \USE_PLD
        pld     [SRC_Y]
.endif
        stmfd   sp!, {r4-r8, r10-r11, lr}

        /* Destination buffer should be at least 16-bit aligned, image width should be multiple of 4 */
        bic     DST, #1
        bic     WIDTH, #3

        /* Ensure 32-bit alignment of the destination buffer */
        tst     DST, #2
        beq     1f
        subs    WIDTH, #4
        blt     6f                      /* fewer than 4 pixels: nothing to do */
        CONVERT_4_PIXELS_MACROBLOCK     /* 6 bytes stored: DST is now 32-bit aligned */
1:
        subs    WIDTH, #32
        blt     3f
2:      /* Convert 32 pixels per loop iteration */
        CONVERT_8_PIXELS_MACROBLOCK_1 r4, r6, 0, 1, 1 /* Also do cache preload for SRC_Y */
        CONVERT_8_PIXELS_MACROBLOCK_2 r6, r7, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_3 r7, r8, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_1 r8, r5, 1, 1, 0
        stmia   DST!, {r4, r6, r7, r8}

        /* Flags set here are consumed by the "bge 2b" at the end of the loop */
        subs    WIDTH, #32

        CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_1 r7, r8, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_2 r8, r4, 1, 1, 0
        stmia   DST!, {r5, r6, r7, r8}
.if \USE_PLD
        /* Do cache preload for SRC_U */
        pld     [SRC_U, #48]
.endif
        CONVERT_8_PIXELS_MACROBLOCK_3 r4, r6, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_1 r6, r7, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_2 r7, r8, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_3 r8, r4, 1, 0, 0 /* end of the fused chain */
        stmia   DST!, {r4, r6, r7, r8}

        bge     2b
3:
        /* Undo the loop bias: WIDTH is the real number of pixels left (0-28) */
        adds    WIDTH, WIDTH, #32
        ble     6f

        subs    WIDTH, WIDTH, #8
        blt     5f
4:      /* Convert remaining pixels processing them 8 per iteration */
        CONVERT_8_PIXELS_MACROBLOCK_1 r4, r5, 0, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_2 r5, r6, 1, 1, 0
        CONVERT_8_PIXELS_MACROBLOCK_3 r6, r7, 1, 0, 0
        stmia   DST!, {r4-r6}
        subs    WIDTH, WIDTH, #8
        bge     4b
5:      /* Convert the last 4 pixels if needed */
        adds    WIDTH, WIDTH, #8
        ble     6f
        /*
         * WIDTH is a multiple of 4 and smaller than 8, so exactly 4 pixels
         * are left here and this block completes the line. No further
         * counter update or branch back into the loop is needed.
         */
        CONVERT_4_PIXELS_MACROBLOCK
6:      /* Restore all registers and return */
        ldmfd  sp!, {r4-r8, r10-r11, pc}

/*
 * Remove the helper macros again, so that they do not stay defined after
 * the template has been expanded.
 */
.purgem CONVERT_4_PIXELS_MACROBLOCK
.purgem CONVERT_8_PIXELS_MACROBLOCK_1
.purgem CONVERT_8_PIXELS_MACROBLOCK_2
.purgem CONVERT_8_PIXELS_MACROBLOCK_3

/* End of the region in which the preprocessor register aliases apply */
#undef  DST
#undef  SRC_Y
#undef  SRC_U
#undef  WIDTH
#undef  TMP1
#undef  TMP2
#undef  TMP3

        .endfunc

.endm

/*
 * Expand the template once: this defines yv12_to_yuv420_line_armv6 with
 * cache preload enabled and the ARMv6 code path selected.
 */
YUV420_function_template function_name=yv12_to_yuv420_line_armv6, USE_PLD=1, USE_ARMV6=1
