• Main Page
  • Related Pages
  • Namespaces
  • Classes
  • Files
  • File List
  • File Members

src/processing/Demosaic_ARM.cpp

00001 #ifdef FCAM_ARCH_ARM
00002 #include "Demosaic_ARM.h"
00003 #include <arm_neon.h>
00004 
00005 namespace FCam {
00006 
00007     // Make a linear luminance -> pixel value lookup table. Only declared here (defined in another file); demosaic_ARM below passes a 4096-entry unsigned char buffer as lut.
00008     extern void makeLUT(const Frame &f, float contrast, int blackLevel, float gamma, unsigned char *lut);
00009 
00010     Image demosaic_ARM(Frame src, float contrast, bool denoise, int blackLevel, float gamma) {
00011 
00012         const int BLOCK_WIDTH  = 40;
00013         const int BLOCK_HEIGHT = 24;
00014 
00015         Image input = src.image();
00016 
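00017         // Check we're the right bayer pattern. If not, crop and continue. NOTE(review): the GBRG case has no break, so after cropping it falls through to the default error return -- confirm whether that is intended.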
00018         switch((int)src.bayerPattern()) {
00019         case GRBG:
00020             break;
00021         case RGGB:
00022             input = input.subImage(1, 0, Size(input.width()-2, input.height()));
00023             break;
00024         case BGGR:
00025             input = input.subImage(0, 1, Size(input.width(), input.height()-2));
00026             break;
00027         case GBRG:
00028             input = input.subImage(1, 1, Size(input.width()-2, input.height()-2));
00029         default:
00030             error(Event::DemosaicError, "Can't demosaic from a non-bayer sensor\n");
00031             return Image();
00032         }       
00033 
00034         int rawWidth = input.width();
00035         int rawHeight = input.height();
00036 
00037         const int VEC_WIDTH = ((BLOCK_WIDTH + 8)/8);
00038         const int VEC_HEIGHT = ((BLOCK_HEIGHT + 8)/2);       
00039 
00040         int rawPixelsPerRow = input.bytesPerRow()/2 ; // Assumes bytesPerRow is even
00041 
00042         int outWidth = rawWidth-8;
00043         int outHeight = rawHeight-8;
00044         outWidth /= BLOCK_WIDTH;
00045         outWidth *= BLOCK_WIDTH;
00046         outHeight /= BLOCK_HEIGHT;
00047         outHeight *= BLOCK_HEIGHT;
00048 
00049         Image out(outWidth, outHeight, RGB24);
00050                 
00051         // Check we're the right size, if not, crop center
00052         if (((input.width() - 8) != (unsigned)outWidth) ||
00053             ((input.height() - 8) != (unsigned)outHeight)) { 
00054             int offX = (input.width() - 8 - outWidth)/2;
00055             int offY = (input.height() - 8 - outHeight)/2;
00056             offX -= offX&1;
00057             offY -= offY&1;
00058             
00059             if (offX || offY) {
00060                 input = input.subImage(offX, offY, Size(outWidth+8, outHeight+8));
00061             }
00062         }           
00063         
00064         Time startTime = Time::now(); 
00065 
00066         // Prepare the color matrix in S8.8 fixed point
00067         float colorMatrix_f[12];
00068         
00069         src.rawToRGBColorMatrix((float *)colorMatrix_f);
00070 
00071         int16x4_t colorMatrix[3];
00072         for (int i = 0; i < 3; i++) {
00073             int16_t val = (int16_t)(colorMatrix_f[i*4+0] * 256 + 0.5);
00074             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 0);
00075             val = (int16_t)(colorMatrix_f[i*4+1] * 256 + 0.5);
00076             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 1);
00077             val = (int16_t)(colorMatrix_f[i*4+2] * 256 + 0.5);
00078             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 2);
00079             val = (int16_t)(colorMatrix_f[i*4+3] * 256 + 0.5);
00080             colorMatrix[i] = vld1_lane_s16(&val, colorMatrix[i], 3);
00081         }
00082 
00083         // A buffer to store data after demosaic and color correction
00084         // but before gamma correction
00085         uint16_t out16[BLOCK_WIDTH*BLOCK_HEIGHT*3];
00086 
00087         // Various color channels. Only 4 of them are defined before
00088         // demosaic, all of them are defined after demosaic
00089         int16_t scratch[VEC_WIDTH*VEC_HEIGHT*4*12];
00090 
00091         #define R_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*0)
00092         #define R_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*1)
00093         #define R_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*2)
00094         #define R_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*3)
00095 
00096         #define G_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*4)
00097         #define G_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*5)
00098         #define G_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*6)
00099         #define G_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*7)
00100 
00101         #define B_R_OFF  (VEC_WIDTH*VEC_HEIGHT*4*8)
00102         #define B_GR_OFF (VEC_WIDTH*VEC_HEIGHT*4*9)
00103         #define B_GB_OFF (VEC_WIDTH*VEC_HEIGHT*4*10)
00104         #define B_B_OFF  (VEC_WIDTH*VEC_HEIGHT*4*11)
00105 
00106         #define R_R(i)  (scratch+(i)+R_R_OFF)
00107         #define R_GR(i) (scratch+(i)+R_GR_OFF)
00108         #define R_GB(i) (scratch+(i)+R_GB_OFF)
00109         #define R_B(i)  (scratch+(i)+R_B_OFF)
00110 
00111         #define G_R(i)  (scratch+(i)+G_R_OFF)
00112         #define G_GR(i) (scratch+(i)+G_GR_OFF)
00113         #define G_GB(i) (scratch+(i)+G_GB_OFF)
00114         #define G_B(i)  (scratch+(i)+G_B_OFF)
00115 
00116         #define B_R(i)  (scratch+(i)+B_R_OFF)
00117         #define B_GR(i) (scratch+(i)+B_GR_OFF)
00118         #define B_GB(i) (scratch+(i)+B_GB_OFF)
00119         #define B_B(i)  (scratch+(i)+B_B_OFF)
00120 
00121         // Reuse some of the output scratch area for the noisy inputs
00122         #define G_GR_NOISY B_GR
00123         #define B_B_NOISY  G_B
00124         #define R_R_NOISY  G_R
00125         #define G_GB_NOISY B_GB
00126 
00127         // Prepare the lookup table
00128         unsigned char lut[4096];
00129         makeLUT(src, contrast, blackLevel, gamma, lut);
00130 
00131         // For each block in the input
00132         for (int by = 0; by < rawHeight-8-BLOCK_HEIGHT+1; by += BLOCK_HEIGHT) {
00133             const short * __restrict__ blockPtr = (const short *)input(0,by);
00134             unsigned char * __restrict__ outBlockPtr = out(0, by);
00135             for (int bx = 0; bx < rawWidth-8-BLOCK_WIDTH+1; bx += BLOCK_WIDTH) {                
00136 
00137                 // Stage 1) Demux a block of input into L1
00138                 if (1) {
00139                     register const int16_t * __restrict__ rawPtr = blockPtr;
00140                     register const int16_t * __restrict__ rawPtr2 = blockPtr + rawPixelsPerRow;
00141 
00142                     register const int rawJump = rawPixelsPerRow*2 - VEC_WIDTH*8;
00143 
00144                     register int16_t * __restrict__ g_gr_ptr = denoise ? G_GR_NOISY(0) : G_GR(0);
00145                     register int16_t * __restrict__ r_r_ptr  = denoise ? R_R_NOISY(0)  : R_R(0);
00146                     register int16_t * __restrict__ b_b_ptr  = denoise ? B_B_NOISY(0)  : B_B(0);
00147                     register int16_t * __restrict__ g_gb_ptr = denoise ? G_GB_NOISY(0) : G_GB(0);
00148 
00149                     for (int y = 0; y < VEC_HEIGHT; y++) {
00150                         for (int x = 0; x < VEC_WIDTH/2; x++) {
00151 
00152                             asm volatile ("# Stage 1) Demux\n");
00153                             
00154                             // The below needs to be volatile, but
00155                             // it's not clear why (if it's not, it
00156                             // gets optimized out entirely)
00157                             asm volatile (
00158                                  "vld2.16  {d6-d9}, [%[rawPtr]]! \n\t"
00159                                  "vld2.16  {d10-d13}, [%[rawPtr2]]! \n\t"
00160                                  "vst1.16  {d6-d7}, [%[g_gr_ptr]]! \n\t"
00161                                  "vst1.16  {d8-d9}, [%[r_r_ptr]]! \n\t"
00162                                  "vst1.16  {d10-d11}, [%[b_b_ptr]]! \n\t"
00163                                  "vst1.16  {d12-d13}, [%[g_gb_ptr]]! \n\t" :
00164                                  [rawPtr]"+r"(rawPtr), 
00165                                  [rawPtr2]"+r"(rawPtr2),
00166                                  [g_gr_ptr]"+r"(g_gr_ptr),
00167                                  [r_r_ptr]"+r"(r_r_ptr),
00168                                  [b_b_ptr]"+r"(b_b_ptr),
00169                                  [g_gb_ptr]"+r"(g_gb_ptr) ::
00170                                  "d6", "d7", "d8", "d9", "d10", "d11", "d12", "d13", "memory");
00171                             
00172                         }
00173 
00174                         rawPtr += rawJump;
00175                         rawPtr2 += rawJump;
00176                     }               
00177                 }
00178 
00179                 // Stage 1.5) Denoise sensor input (noisy pixel suppression)
00180 
00181                 // A pixel can't be brighter than its brightest neighbor
00182 
00183                 if (denoise) {
00184                     register int16_t * __restrict__ ptr_in = NULL;
00185                     register int16_t * __restrict__ ptr_out = NULL;
00186                     asm volatile("#Stage 1.5: Denoise\n\t");
00187                     for (int b=0; b<4; b++) {
00188                         if (b==0) ptr_in = G_GR_NOISY(0);
00189                         if (b==1) ptr_in = R_R_NOISY(0);
00190                         if (b==2) ptr_in = B_B_NOISY(0);
00191                         if (b==3) ptr_in = G_GB_NOISY(0);
00192                         if (b==0) ptr_out = G_GR(0);
00193                         if (b==1) ptr_out = R_R(0);
00194                         if (b==2) ptr_out = B_B(0);
00195                         if (b==3) ptr_out = G_GB(0);
00196 
00197                         // write the top block pixels that aren't being denoised
00198                         for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00199                             int16x8_t in = vld1q_s16(ptr_in);
00200                             vst1q_s16(ptr_out, in);
00201                             ptr_in+=8;
00202                             ptr_out+=8;
00203                         }
00204 
00205                         for (int y = 1; y < VEC_HEIGHT - 1; y++) {
00206                             for (int x = 0; x < VEC_WIDTH/2; x++) {
00207                                 int16x8_t here  = vld1q_s16(ptr_in);
00208                                 int16x8_t above = vld1q_s16(ptr_in + VEC_WIDTH*4);
00209                                 int16x8_t under = vld1q_s16(ptr_in - VEC_WIDTH*4);
00210                                 int16x8_t right = vld1q_s16(ptr_in + 1);
00211                                 int16x8_t left  = vld1q_s16(ptr_in - 1);
00212                                 int16x8_t max, min;
00213 
00214                                 // find the max and min of the neighbors
00215                                 max = vmaxq_s16(left, right);
00216                                 max = vmaxq_s16(above, max);
00217                                 max = vmaxq_s16(under, max);
00218 
00219                                 min = vminq_s16(left, right);
00220                                 min = vminq_s16(above, min);
00221                                 min = vminq_s16(under, min);                               
00222 
00223                                 // clamp here to be within this range
00224                                 here  = vminq_s16(max, here);
00225                                 here  = vmaxq_s16(min, here);
00226 
00227                                 vst1q_s16(ptr_out, here);
00228                                 ptr_in += 8;
00229                                 ptr_out += 8;
00230                             }
00231                         }
00232 
00233                         // write the bottom block pixels that aren't being denoised
00234                         for (int x = 0; x < (BLOCK_WIDTH+8); x+=8) {
00235                             int16x8_t in = vld1q_s16(ptr_in);
00236                             vst1q_s16(ptr_out, in);
00237                             ptr_in+=8;
00238                             ptr_out+=8;
00239                         }
00240                     }
00241                 }
00242 
00243                 // Stage 2 and 3) Do horizontal and vertical
00244                 // interpolation of green, as well as picking the
00245                 // output for green
00246                 /*
00247                   gv_r = (gb[UP] + gb[HERE])/2;
00248                   gvd_r = (gb[UP] - gb[HERE]);
00249                   gh_r = (gr[HERE] + gr[RIGHT])/2;
00250                   ghd_r = (gr[HERE] - gr[RIGHT]);                 
00251                   g_r = ghd_r < gvd_r ? gh_r : gv_r;
00252                   
00253                   gv_b = (gr[DOWN] + gr[HERE])/2;
00254                   gvd_b = (gr[DOWN] - gr[HERE]);                  
00255                   gh_b = (gb[LEFT] + gb[HERE])/2;
00256                   ghd_b = (gb[LEFT] - gb[HERE]);                  
00257                   g_b = ghd_b < gvd_b ? gh_b : gv_b;
00258                 */
00259                 if (1) {
00260                 
00261                     int i = VEC_WIDTH*4;
00262 
00263                     register int16_t *g_gb_up_ptr = G_GB(i) - VEC_WIDTH*4;
00264                     register int16_t *g_gb_here_ptr = G_GB(i);
00265                     register int16_t *g_gb_left_ptr = G_GB(i) - 1;
00266                     register int16_t *g_gr_down_ptr = G_GR(i) + VEC_WIDTH*4;
00267                     register int16_t *g_gr_here_ptr = G_GR(i);
00268                     register int16_t *g_gr_right_ptr = G_GR(i) + 1;
00269                     register int16_t *g_r_ptr = G_R(i);
00270                     register int16_t *g_b_ptr = G_B(i);
00271             
00272                     for (int y = 1; y < VEC_HEIGHT-1; y++) {
00273                         for (int x = 0; x < VEC_WIDTH/2; x++) {
00274 
00275                             asm volatile ("#Stage 2) Green interpolation\n");
00276 
00277                             // Load the inputs
00278 
00279                             int16x8_t gb_up = vld1q_s16(g_gb_up_ptr);
00280                             g_gb_up_ptr+=8;
00281                             int16x8_t gb_here = vld1q_s16(g_gb_here_ptr);
00282                             g_gb_here_ptr+=8;
00283                             int16x8_t gb_left = vld1q_s16(g_gb_left_ptr); // unaligned
00284                             g_gb_left_ptr+=8;
00285                             int16x8_t gr_down = vld1q_s16(g_gr_down_ptr);
00286                             g_gr_down_ptr+=8;
00287                             int16x8_t gr_here = vld1q_s16(g_gr_here_ptr);
00288                             g_gr_here_ptr+=8;
00289                             int16x8_t gr_right = vld1q_s16(g_gr_right_ptr); // unaligned
00290                             g_gr_right_ptr+=8;
00291                             
00292                             //I couldn't get this assembly to work, and I don't know which
00293                             // of the three blocks of assembly is wrong
00294                             // This asm should load the inputs
00295                             /*
00296                             asm volatile(
00297                             "vld1.16        {d16-d17}, [%[gb_up]]!\n\t"
00298                             "vld1.16        {d18-d19}, [%[gb_here]]!\n\t"
00299                             "vld1.16        {d20-d21}, [%[gb_left]]!\n\t"
00300                             "vld1.16        {d22-d23}, [%[gr_down]]!\n\t"
00301                             "vld1.16        {d24-d25}, [%[gr_here]]!\n\t"
00302                             "vld1.16        {d26-d27}, [%[gr_right]]!\n\t"
00303                             :
00304                             [gb_up]"+r"(g_gb_up_ptr),
00305                             [gb_here]"+r"(g_gb_here_ptr),
00306                             [gb_left]"+r"(g_gb_left_ptr),
00307                             [gr_down]"+r"(g_gr_down_ptr),
00308                             [gr_here]"+r"(g_gr_here_ptr),
00309                             [gr_right]"+r"(g_gr_right_ptr) :: 
00310                             //"d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27",
00311                             "q8","q9","q10","q11","q12","q13");
00312 
00313                             //q8 - gb_up
00314                             //q9 - gb_here
00315                             //q10 - gb_left
00316                             //q11 - gr_down
00317                             //q12 - gr_here
00318                             //q13 - gr_right
00319                             */
00320 
00321                             // Do the processing
00322                             int16x8_t gv_r  = vhaddq_s16(gb_up, gb_here);
00323                             int16x8_t gvd_r = vabdq_s16(gb_up, gb_here);
00324                             int16x8_t gh_r  = vhaddq_s16(gr_right, gr_here);
00325                             int16x8_t ghd_r = vabdq_s16(gr_here, gr_right);
00326                             int16x8_t g_r = vbslq_s16(vcltq_s16(ghd_r, gvd_r), gh_r, gv_r);
00327 
00328                             int16x8_t gv_b  = vhaddq_s16(gr_down, gr_here);
00329                             int16x8_t gvd_b = vabdq_s16(gr_down, gr_here);
00330                             int16x8_t gh_b  = vhaddq_s16(gb_left, gb_here);
00331                             int16x8_t ghd_b = vabdq_s16(gb_left, gb_here);
00332                             int16x8_t g_b = vbslq_s16(vcltq_s16(ghd_b, gvd_b), gh_b, gv_b);
00333                             
00334                             //this asm should do the above selection/interpolation
00335                             /*
00336                             asm volatile(
00337                             "vabd.s16       q0, q12, q13\n\t" //ghd_r
00338                             "vabd.s16       q1, q8, q9\n\t" //gvd_r
00339                             "vabd.s16       q2, q10, q9\n\t" //ghd_b
00340                             "vabd.s16       q3, q11, q12\n\t" //gvd_b
00341                             "vcgt.s16       q1, q0, q1\n\t" //select ghd_r or gvd_r
00342                             "vcgt.s16       q2, q2, q3\n\t" //select gvd_b or ghd_b
00343                             "vhadd.s16      q8, q8, q9\n\t" //gv_r
00344                             "vhadd.s16      q11, q11, q12\n\t" //gv_b
00345                             "vhadd.s16      q12, q12, q13\n\t" //gh_r
00346                             "vhadd.s16      q9, q9, q10\n\t" //gh_b
00347                             "vbsl           q1, q12, q8\n\t" //g_r
00348                             "vbsl           q2, q9, q11\n\t" //g_b
00349                              ::: "q0","q1","q2","q3","q8","q9","q10","q11","q12","q13");
00350                             */
00351 
00352                             //this should save the output
00353                             /*
00354                             asm volatile(
00355                             "vst1.16        {d2-d3}, [%[g_r]]!\n\t"
00356                             "vst1.16        {d4-d5}, [%[g_b]]!\n\t" :
00357                             [g_r]"+r"(g_r_ptr),[g_b]"+r"(g_b_ptr)
00358                             :: "memory");
00359                             */
00360                             
00361                             // Save the outputs
00362                             vst1q_s16(g_r_ptr, g_r);
00363                             g_r_ptr+=8;
00364                             vst1q_s16(g_b_ptr, g_b);
00365                             g_b_ptr+=8;
00366                         }
00367                     }
00368                 }
00369                 asm volatile ("#End of stage 2 (green interpolation)\n");
00370                 // Stages 4-9
00371 
00372                 if (1) {
00373                     
00374                     /*
00375                       r_gr = (r[LEFT] + r[HERE])/2 + gr[HERE] - (g_r[LEFT] + g_r[HERE])/2;
00376                       b_gr = (b[UP] + b[HERE])/2 + gr[HERE] - (g_b[UP] + g_b[HERE])/2;
00377                       r_gb = (r[HERE] + r[DOWN])/2 + gb[HERE] - (g_r[HERE] + g_r[DOWN])/2;
00378                       b_gb = (b[HERE] + b[RIGHT])/2 + gb[HERE] - (g_b[HERE] + g_b[RIGHT])/2;
00379                       
00380                       rp_b = (r[DOWNLEFT] + r[HERE])/2 + g_b[HERE] - (g_r[DOWNLEFT] + g_r[HERE])/2;
00381                       rn_b = (r[LEFT] + r[DOWN])/2 + g_b[HERE] - (g_r[LEFT] + g_r[DOWN])/2;
00382                       rpd_b = (r[DOWNLEFT] - r[HERE]);
00383                       rnd_b = (r[LEFT] - r[DOWN]);      
00384                       r_b = rpd_b < rnd_b ? rp_b : rn_b;                      
00385         
00386                       bp_r = (b[UPRIGHT] + b[HERE])/2 + g_r[HERE] - (g_b[UPRIGHT] + g_b[HERE])/2;
00387                       bn_r = (b[RIGHT] + b[UP])/2 + g_r[HERE] - (g_b[RIGHT] + g_b[UP])/2;       
00388                       bpd_r = (b[UPRIGHT] - b[HERE]);
00389                       bnd_r = (b[RIGHT] - b[UP]);       
00390                       b_r = bpd_r < bnd_r ? bp_r : bn_r;
00391                     */
00392 
00393                     int i = 2*VEC_WIDTH*4;
00394 
00395                     for (int y = 2; y < VEC_HEIGHT-2; y++) {
00396                         for (int x = 0; x < VEC_WIDTH; x++) {
00397 
00398                             asm volatile ("#Stage 4) r/b interpolation\n");
00399 
00400                             // Load the inputs
00401                             int16x4_t r_here       = vld1_s16(R_R(i));
00402                             int16x4_t r_left       = vld1_s16(R_R(i) - 1);
00403                             int16x4_t r_down       = vld1_s16(R_R(i) + VEC_WIDTH*4);
00404 
00405                             int16x4_t g_r_left     = vld1_s16(G_R(i) - 1);
00406                             int16x4_t g_r_here     = vld1_s16(G_R(i));
00407                             int16x4_t g_r_down     = vld1_s16(G_R(i) + VEC_WIDTH*4);
00408 
00409                             int16x4_t b_up         = vld1_s16(B_B(i) - VEC_WIDTH*4);
00410                             int16x4_t b_here       = vld1_s16(B_B(i));
00411                             int16x4_t b_right      = vld1_s16(B_B(i) + 1);
00412 
00413                             int16x4_t g_b_up       = vld1_s16(G_B(i) - VEC_WIDTH*4);
00414                             int16x4_t g_b_here     = vld1_s16(G_B(i));
00415                             int16x4_t g_b_right    = vld1_s16(G_B(i) + 1);
00416 
00417                             // Do the processing
00418                             int16x4_t gr_here      = vld1_s16(G_GR(i));
00419                             int16x4_t gb_here      = vld1_s16(G_GB(i));
00420 
00421                             { // red at green
00422                                 int16x4_t r_gr  = vadd_s16(vhadd_s16(r_left, r_here),
00423                                                             vsub_s16(gr_here,
00424                                                                       vhadd_s16(g_r_left, g_r_here)));
00425                                 int16x4_t r_gb  = vadd_s16(vhadd_s16(r_here, r_down),
00426                                                             vsub_s16(gb_here, 
00427                                                                       vhadd_s16(g_r_down, g_r_here)));
00428                                 vst1_s16(R_GR(i), r_gr);
00429                                 vst1_s16(R_GB(i), r_gb);
00430                             }
00431                             
00432                             { // red at blue
00433                                 int16x4_t r_downleft   = vld1_s16(R_R(i) + VEC_WIDTH*4 - 1);
00434                                 int16x4_t g_r_downleft = vld1_s16(G_R(i) + VEC_WIDTH*4 - 1);
00435                                 
00436                                 int16x4_t rp_b  = vadd_s16(vhadd_s16(r_downleft, r_here),
00437                                                             vsub_s16(g_b_here,
00438                                                                      vhadd_s16(g_r_downleft, g_r_here)));
00439                                 int16x4_t rn_b  = vadd_s16(vhadd_s16(r_left, r_down),
00440                                                             vsub_s16(g_b_here,
00441                                                                      vhadd_s16(g_r_left, g_r_down)));
00442                                 int16x4_t rpd_b = vabd_s16(r_downleft, r_here);
00443                                 int16x4_t rnd_b = vabd_s16(r_left, r_down);
00444                                 int16x4_t r_b   = vbsl_s16(vclt_s16(rpd_b, rnd_b), rp_b, rn_b);
00445                                 vst1_s16(R_B(i), r_b);
00446                             }
00447                             
00448                             { // blue at green
00449                                 int16x4_t b_gr  = vadd_s16(vhadd_s16(b_up, b_here),
00450                                                             vsub_s16(gr_here,
00451                                                                      vhadd_s16(g_b_up, g_b_here)));
00452                                 int16x4_t b_gb  = vadd_s16(vhadd_s16(b_here, b_right),
00453                                                             vsub_s16(gb_here,
00454                                                                      vhadd_s16(g_b_right, g_b_here)));
00455                                 vst1_s16(B_GR(i), b_gr);
00456                                 vst1_s16(B_GB(i), b_gb);
00457                             }
00458                             
00459                             { // blue at red
00460                                 int16x4_t b_upright    = vld1_s16(B_B(i) - VEC_WIDTH*4 + 1);
00461                                 int16x4_t g_b_upright  = vld1_s16(G_B(i) - VEC_WIDTH*4 + 1);
00462                                 
00463                                 int16x4_t bp_r  = vadd_s16(vhadd_s16(b_upright, b_here),
00464                                                             vsub_s16(g_r_here, 
00465                                                                       vhadd_s16(g_b_upright, g_b_here)));
00466                                 int16x4_t bn_r  = vadd_s16(vhadd_s16(b_right, b_up),
00467                                                             vsub_s16(g_r_here,
00468                                                                       vhadd_s16(g_b_right, g_b_up)));
00469                                 int16x4_t bpd_r = vabd_s16(b_upright, b_here);
00470                                 int16x4_t bnd_r = vabd_s16(b_right, b_up);
00471                                 int16x4_t b_r   = vbsl_s16(vclt_s16(bpd_r, bnd_r), bp_r, bn_r);
00472                                 vst1_s16(B_R(i), b_r);
00473                             }
00474                             
00475                             // Advance the index
00476                             i += 4;
00477                         }
00478                     }
00479                     asm volatile ("#End of stage 4 - what_ever\n\t");
00480                 }
00481 
00482                 // Stage 10)
00483                 if (1) {
00484                     // Color-correct and save the results into a 16-bit buffer for gamma correction
00485 
00486                     asm volatile ("#Stage 10) Color Correction\n");
00487 
00488                     uint16_t * __restrict__ out16Ptr = out16;
00489 
00490                     int i = 2*VEC_WIDTH*4;
00491 
00492                     const uint16x4_t bound = vdup_n_u16(1023);
00493 
00494                     for (int y = 2; y < VEC_HEIGHT-2; y++) {
00495 
00496                         // skip the first vec in each row
00497                         
00498                         int16x4x2_t r0 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00499                         int16x4x2_t g0 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00500                         int16x4x2_t b0 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00501                         i += 4;
00502 
00503                         for (int x = 1; x < VEC_WIDTH; x++) {
00504                             
00505                             int16x4x2_t r1 = vzip_s16(vld1_s16(R_GR(i)), vld1_s16(R_R(i)));
00506                             int16x4x2_t g1 = vzip_s16(vld1_s16(G_GR(i)), vld1_s16(G_R(i)));
00507                             int16x4x2_t b1 = vzip_s16(vld1_s16(B_GR(i)), vld1_s16(B_R(i)));
00508                             
00509                             // do the color matrix
00510                             int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                       
00511                             rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00512                             rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00513                             rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00514                             
00515                             int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                       
00516                             gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00517                             gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00518                             gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00519                             
00520                             int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00521                             bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00522                             bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00523                             bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00524                             
00525                             uint16x4x3_t col16;
00526                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00527                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00528                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00529                             col16.val[0] = vmin_u16(col16.val[0], bound);
00530                             col16.val[1] = vmin_u16(col16.val[1], bound);
00531                             col16.val[2] = vmin_u16(col16.val[2], bound);
00532                             vst3_u16(out16Ptr, col16);
00533                             out16Ptr += 12;
00534                             
00535                             rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                 
00536                             rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00537                             rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00538                             rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00539                             
00540                             gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                 
00541                             gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00542                             gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00543                             gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00544                             
00545                             bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00546                             bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00547                             bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00548                             bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00549                             
00550                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00551                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00552                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00553                             col16.val[0] = vmin_u16(col16.val[0], bound);
00554                             col16.val[1] = vmin_u16(col16.val[1], bound);
00555                             col16.val[2] = vmin_u16(col16.val[2], bound);
00556                             vst3_u16(out16Ptr, col16);
00557                             out16Ptr += 12;
00558                             
00559                             r0 = r1;
00560                             g0 = g1;
00561                             b0 = b1;
00562 
00563                             i += 4;
00564                         }
00565 
00566                         // jump back
00567                         i -= VEC_WIDTH*4;
00568 
00569                         r0 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00570                         g0 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00571                         b0 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));                     
00572                         i += 4;
00573 
00574                         for (int x = 1; x < VEC_WIDTH; x++) {
00575                             int16x4x2_t r1 = vzip_s16(vld1_s16(R_B(i)), vld1_s16(R_GB(i)));
00576                             int16x4x2_t g1 = vzip_s16(vld1_s16(G_B(i)), vld1_s16(G_GB(i)));
00577                             int16x4x2_t b1 = vzip_s16(vld1_s16(B_B(i)), vld1_s16(B_GB(i)));
00578                             
00579                             // do the color matrix
00580                             int32x4_t rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                       
00581                             rout = vmlal_lane_s16(rout, r0.val[1], colorMatrix[0], 0);
00582                             rout = vmlal_lane_s16(rout, g0.val[1], colorMatrix[0], 1);
00583                             rout = vmlal_lane_s16(rout, b0.val[1], colorMatrix[0], 2);
00584                             
00585                             int32x4_t gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                       
00586                             gout = vmlal_lane_s16(gout, r0.val[1], colorMatrix[1], 0);
00587                             gout = vmlal_lane_s16(gout, g0.val[1], colorMatrix[1], 1);
00588                             gout = vmlal_lane_s16(gout, b0.val[1], colorMatrix[1], 2);
00589                             
00590                             int32x4_t bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00591                             bout = vmlal_lane_s16(bout, r0.val[1], colorMatrix[2], 0);
00592                             bout = vmlal_lane_s16(bout, g0.val[1], colorMatrix[2], 1);
00593                             bout = vmlal_lane_s16(bout, b0.val[1], colorMatrix[2], 2);
00594                             
00595                             uint16x4x3_t col16;
00596                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00597                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00598                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00599                             col16.val[0] = vmin_u16(col16.val[0], bound);
00600                             col16.val[1] = vmin_u16(col16.val[1], bound);
00601                             col16.val[2] = vmin_u16(col16.val[2], bound);
00602                             vst3_u16(out16Ptr, col16);
00603                             out16Ptr += 12;
00604                             
00605                             rout = vmovl_s16(vdup_lane_s16(colorMatrix[0], 3));                 
00606                             rout = vmlal_lane_s16(rout, r1.val[0], colorMatrix[0], 0);
00607                             rout = vmlal_lane_s16(rout, g1.val[0], colorMatrix[0], 1);
00608                             rout = vmlal_lane_s16(rout, b1.val[0], colorMatrix[0], 2);
00609                             
00610                             gout = vmovl_s16(vdup_lane_s16(colorMatrix[1], 3));                 
00611                             gout = vmlal_lane_s16(gout, r1.val[0], colorMatrix[1], 0);
00612                             gout = vmlal_lane_s16(gout, g1.val[0], colorMatrix[1], 1);
00613                             gout = vmlal_lane_s16(gout, b1.val[0], colorMatrix[1], 2);
00614                             
00615                             bout = vmovl_s16(vdup_lane_s16(colorMatrix[2], 3));
00616                             bout = vmlal_lane_s16(bout, r1.val[0], colorMatrix[2], 0);
00617                             bout = vmlal_lane_s16(bout, g1.val[0], colorMatrix[2], 1);
00618                             bout = vmlal_lane_s16(bout, b1.val[0], colorMatrix[2], 2);
00619                             
00620                             col16.val[0] = vqrshrun_n_s32(rout, 8);
00621                             col16.val[1] = vqrshrun_n_s32(gout, 8);
00622                             col16.val[2] = vqrshrun_n_s32(bout, 8);                     
00623                             col16.val[0] = vmin_u16(col16.val[0], bound);
00624                             col16.val[1] = vmin_u16(col16.val[1], bound);
00625                             col16.val[2] = vmin_u16(col16.val[2], bound);
00626                             vst3_u16(out16Ptr, col16);
00627                             out16Ptr += 12;
00628                             
00629                             r0 = r1;
00630                             g0 = g1;
00631                             b0 = b1;
00632 
00633                             i += 4;
00634                         }
00635                     }   
00636                     asm volatile("#End of stage 10) - color correction\n\t");
00637                 }
00638                 
00639 
00640                 if (1) {
00641 
00642                     asm volatile("#Gamma Correction\n");                   
00643                     // Gamma correction (on the CPU, not the NEON)
00644                     const uint16_t * __restrict__ out16Ptr = out16;
00645                     
00646                     for (int y = 0; y < BLOCK_HEIGHT; y++) {                    
00647                         unsigned int * __restrict__ outPtr32 = (unsigned int *)(outBlockPtr + y * outWidth * 3);
00648                         for (int x = 0; x < (BLOCK_WIDTH*3)/4; x++) {
00649                             unsigned val = ((lut[out16Ptr[0]] << 0) |
00650                                             (lut[out16Ptr[1]] << 8) | 
00651                                             (lut[out16Ptr[2]] << 16) |
00652                                             (lut[out16Ptr[3]] << 24));
00653                             *outPtr32++ = val;
00654                             out16Ptr += 4;
00655                             // *outPtr++ = lut[*out16Ptr++];
00656                         }
00657                     }           
00658                     asm volatile("#end of Gamma Correction\n");                   
00659                     
00660                     /*
00661                     const uint16_t * __restrict__ out16Ptr = out16;                 
00662                     for (int y = 0; y < BLOCK_HEIGHT; y++) {                    
00663                         unsigned char * __restrict__ outPtr = (outBlockPtr + y * outWidth * 3);
00664                         for (int x = 0; x < (BLOCK_WIDTH*3); x++) {
00665                             *outPtr++ = lut[*out16Ptr++];
00666                         }
00667                     }
00668                     */
00669                     
00670                 }
00671                 
00672 
00673                 blockPtr += BLOCK_WIDTH;
00674                 outBlockPtr += BLOCK_WIDTH*3;
00675             }
00676         }       
00677 
00678         //std::cout << "Done demosaicking. time = " << ((Time::now() - startTime)/1000) << std::endl;
00679         return out;
00680     }
00681 
00682     Image makeThumbnailRAW_ARM(Frame src, float contrast, int blackLevel, float gamma) {
00683         // Assuming we want a slightly-cropped thumbnail into 640x480, Bayer pattern GRBG
00684         // This means averaging together a 4x4 block of Bayer pattern for one RGB24 pixel
00685         // Also want to convert to sRGB, which includes a color matrix multiply and a gamma transform
00686         // using a lookup table.
00687 
00688         // Implementation: 
00689         //   Uses ARM NEON SIMD vector instructions and inline assembly.
00690         //   Reads in a 16x4 block of pixels at a time, in 16-bit GRBG Bayer format, and outputs a 4x1 block of RGB24 pixels.
00691         // Important note: Due to some apparent bugs in GCC's inline assembly register mapping between C variables and NEON registers,
00692         //   namely that trying to reference an int16x4 variable creates a reference to a s register instead of a d register, all the
00693         //   int16x4 variables are forced into specific NEON registers, and then referred to using that register, not by name.  
00694         //   This bug seems to be in gcc 4.2.1, should be fixed by 4.4 based on some gcc bug reports.
00695 
00696         Image thumb(640, 480, RGB24);
00697         const unsigned int w = 2592, tw = 640;
00698         const unsigned int h = 1968, th = 480;
00699         const unsigned int scale = 4;
00700         const unsigned int cw = tw*scale;
00701         const unsigned int ch = th*scale;
00702         const unsigned int startX = (w-cw)/2;
00703         const unsigned int startY = (h-ch)/2;        
00704         const unsigned int bytesPerRow = src.image().bytesPerRow();
00705 
00706         // Make the response curve
00707         unsigned char lut[4096];
00708         makeLUT(src, contrast, blackLevel, gamma, lut);
00709 
00710         unsigned char *row = src.image()(startX, startY);
00711 
00712         Time startTime = Time::now();
00713         float colorMatrix_f[12];
00714         src.rawToRGBColorMatrix(colorMatrix_f);
00715 
00716         register int16x4_t colorMatrix0 asm ("d0"); // ASM assignments are essential - they're implicitly trusted by the inline code.
00717         register int16x4_t colorMatrix1 asm ("d1");
00718         register int16x4_t colorMatrix2 asm ("d2");
00719         register int16x4_t wCoord asm ("d20"); // Workaround for annoyances with scalar addition.
00720         register int16x4_t maxValue asm ("d21"); // Maximum allowed signed 16-bit value
00721         register int16x4_t minValue asm ("d22"); // Minimum allowed signed 16-bit value
00722 
00723         asm volatile(
00724                     // Load matrix into colorMatrix0-2, set to be d0-d2
00725                     "vldm %[colorMatrix_f], {q2,q3,q4}  \n\t"
00726                     "vcvt.s32.f32 q2, q2, #8  \n\t" // Float->fixed-point conversion
00727                     "vcvt.s32.f32 q3, q3, #8  \n\t"
00728                     "vcvt.s32.f32 q4, q4, #8  \n\t"
00729                     "vmovn.i32 d0, q2  \n\t" // Narrowing to 16-bit
00730                     "vmovn.i32 d1, q3  \n\t"
00731                     "vmovn.i32 d2, q4  \n\t"
00732                     // Load homogenous coordinate, pixel value limits
00733                     "vmov.i16  d20, #0x4   \n\t"  // Homogenous coordinate. 
00734                     "vmov.i16  d21, #0x00FF  \n\t"  // Maximum pixel value: 1023
00735                     "vorr.i16  d21, #0x0300  \n\t"  // Maximum pixel value part 2
00736                     "vmov.i16  d22, #0x0     \n\t"  // Minimum pixel value: 0
00737                     : [colorMatrix0] "=w" (colorMatrix0),
00738                       [colorMatrix1] "=w" (colorMatrix1),
00739                       [colorMatrix2] "=w" (colorMatrix2),
00740                       [wCoord] "=w" (wCoord),
00741                       [maxValue] "=w" (maxValue),
00742                       [minValue] "=w" (minValue)
00743                     :  [colorMatrix_f] "r" (colorMatrix_f)
00744                     : "memory",
00745                       "d3", "d4", "d5", "d6", "d7", "d8", "d9");
00746                 
00747         for (unsigned int ty = 0; ty <480; ty++, row+=4*bytesPerRow) {
00748             register unsigned short *px0 = (unsigned short *)row;
00749             register unsigned short *px1 = (unsigned short *)(row+1*bytesPerRow);
00750             register unsigned short *px2 = (unsigned short *)(row+2*bytesPerRow);
00751             register unsigned short *px3 = (unsigned short *)(row+3*bytesPerRow);
00752 
00753             register unsigned char *dst = thumb(0,ty);
00754             for (register unsigned int tx =0; tx < 640; tx+=scale) {
00755                 // Assembly block for fast downsample/demosaic, color correction, and gamma curve lookup
00756                 asm volatile(
00757                     // *px0: GRGR GRGR GRGR GRGR
00758                     // *px1: BGBG BGBG BGBG BGBG
00759                     // *px2: GRGR GRGR GRGR GRGR
00760                     // *px3: BGBG BGBG BGBG BGBG
00762                     "vld2.16 {d4-d7}, [%[px0]]!  \n\t"
00763                     "vld2.16 {d8-d11}, [%[px1]]! \n\t"
00764                     "vld2.16 {d12-d15}, [%[px2]]! \n\t"
00765                     "vld2.16 {d16-d19}, [%[px3]]! \n\t"
00766                     //  d4    d5    d6    d7
00767                     // GG|GG GG|GG RR|RR RR|RR
00768                     //  d8    d9    d10   d11
00769                     // BB|BB BB|BB GG|GG GG|GG
00770                     //  d12   d13   d14   d15
00771                     // GG|GG GG|GG RR|RR RR|RR
00772                     //  d16   d17   d18   d19
00773                     // BB|BB BB|BB GG|GG GG|GG
00774 
00776                     "vpadd.u16 d4, d4, d5  \n\t"   // G1
00777                     "vpadd.u16 d5, d6, d7  \n\t"   // R1
00778                     "vpadd.u16 d6, d8, d9  \n\t"   // B1
00779                     "vpadd.u16 d7, d10, d11 \n\t"  // G2
00780                     "vpadd.u16 d8, d12, d13 \n\t"  // G3
00781                     "vpadd.u16 d9, d14, d15 \n\t"  // R2
00782                     "vpadd.u16 d10, d16, d17 \n\t" // B2
00783                     "vpadd.u16 d11, d18, d19 \n\t" // G4
00784                     //    d4       d5       d6       d7
00785                     // G|G|G|G  R|R|R|R  B|B|B|B  G|G|G|G
00786                     //    d8       d9       d10      d11
00787                     // G|G|G|G  R|R|R|R  B|B|B|B  G|G|G|G
00788 
00790                     "vadd.u16 d7, d8   \n\t"
00791                     "vadd.u16 d4, d11   \n\t"
00792                     "vhadd.u16 d4, d7  \n\t"
00794                     "vadd.u16 d5, d9  \n\t"
00796                     "vadd.u16 d6, d10 \n\t"
00797                     //    d4       d5       d6  
00798                     // G|G|G|G  R|R|R|R  B|B|B|B
00799                     //
00800                     // Assuming sRGB affine matrix stored in fixed precision (lsb = 1/256)
00801                     // Trusting GCC to properly assign colorMatrix0-2 to d0-d2.  Direct reference seems to be broken on g++ 4.2.1 at least.
00802                     // r   colorMatrix0[0] [1] [2] [3]   r_in
00803                     // g = colorMatrix1[0] [1] [2] [3] * g_in
00804                     // b   colorMatrix2[0] [1] [2] [3]   b_in
00805 
00807 
00808                     "vmull.s16 q4, d5, d0[0] \n\t"
00809                     "vmlal.s16 q4, d4, d0[1] \n\t"
00810                     "vmlal.s16 q4, d6, d0[2] \n\t"
00811                     "vmlal.s16 q4, d20, d0[3] \n\t" 
00812 
00813                     "vmull.s16 q5, d5, d1[0] \n\t"
00814                     "vmlal.s16 q5, d4, d1[1] \n\t"
00815                     "vmlal.s16 q5, d6, d1[2] \n\t"
00816                     "vmlal.s16 q5, d20, d1[3] \n\t"
00817 
00818                     "vmull.s16 q6, d5, d2[0] \n\t"
00819                     "vmlal.s16 q6, d4, d2[1] \n\t"
00820                     "vmlal.s16 q6, d6, d2[2] \n\t"
00821                     "vmlal.s16 q6, d20, d2[3] \n\t"
00822 
00823                     //  d08  d09  d10  d11  d12  d13
00824                     //  R|R  R|R  G|G  G|G  B|B  B|B
00825 
00827                     "vrshrn.s32 d3, q4, #10  \n\t"
00828                     "vrshrn.s32 d4, q5, #10  \n\t"
00829                     "vrshrn.s32 d5, q6, #10  \n\t"
00831                     "vmin.s16 d3, d3, d21    \n\t"
00832                     "vmin.s16 d4, d4, d21    \n\t"
00833                     "vmin.s16 d5, d5, d21    \n\t"
00834                     "vmax.s16 d3, d3, d22    \n\t"
00835                     "vmax.s16 d4, d4, d22    \n\t"
00836                     "vmax.s16 d5, d5, d22    \n\t"
00837 
00838                     //    d3       d4       d2
00839                     // R|R|R|R  G|G|G|G  B|B|B|B
00840                     
00842                     "vmov r0,r1, d3                        \n\t"
00843                     //    r0       r1
00844                     // R16|R16  R16|R16
00845                     "uxth r2, r0                           \n\t" // Extract first red pixel into r2
00846                     "ldrb r4, [%[gammaTable], r2]          \n\t" // Table lookup, byte result
00847 
00848                     "uxth r2, r0, ROR #16                  \n\t"
00849                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00850                     "orr  r4, r4, r3, LSL #24              \n\t"
00851 
00852                     "uxth r2, r1                           \n\t"
00853                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00854                     "mov  r5, r3, LSL #16                  \n\t"
00855 
00856                     "uxth r2, r1, ROR #16                  \n\t"
00857                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00858                     "mov  r6, r3, LSL #8                   \n\t"
00859 
00860                     //   r4   r5   r6  
00861                     //  R__R __R_ _R__  -> increasing mem address (and increasing left shift)
00862 
00863                     "vmov r0,r1, d4                        \n\t"
00864                     //    r0       r1
00865                     // G16|G16  G16|G16
00866                     "uxth r2, r0                           \n\t"
00867                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00868                     "orr  r4, r4, r3, LSL #8               \n\t"
00869 
00870                     "uxth r2, r0, ROR #16                  \n\t"
00871                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00872                     "orr  r5, r5, r3                       \n\t"
00873 
00874                     "uxth r2, r1                           \n\t"
00875                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00876                     "orr  r5, r5, r3, LSL #24              \n\t"
00877 
00878                     "uxth r2, r1, ROR #16                  \n\t"
00879                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00880                     "orr  r6, r6, r3, LSL #16              \n\t"
00881 
00882                     //   r4   r5   r6  
00883                     //  RG_R G_RG _RG_  -> increasing mem address (and increasing left shift)
00884 
00885                     "vmov r0,r1, d5                        \n\t"
00886                     //    r0       r1
00887                     // B16|B16  B16|B16
00888                     "uxth r2, r0                           \n\t"
00889                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00890                     "orr  r4, r4, r3, LSL #16              \n\t"
00891 
00892                     "uxth r2, r0, ROR #16                  \n\t"
00893                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00894                     "orr  r5, r5, r3, LSL #8               \n\t"
00895 
00896                     "uxth r2, r1                           \n\t"
00897                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00898                     "orr  r6, r6, r3                       \n\t"
00899 
00900                     "uxth r2, r1, ROR #16                  \n\t"
00901                     "ldrb r3, [%[gammaTable], r2]          \n\t"
00902                     "orr  r6, r6, r3, LSL #24              \n\t"
00903 
00904                     //   r4   r5   r6  
00905                     //  RGBR GBRG BRGB 
00906 
00907                     "stm %[dst]!, {r4,r5,r6}                   \n\t" // multi-store!
00908                     : [px0] "+&r" (px0),
00909                       [px1] "+&r" (px1),
00910                       [px2] "+&r" (px2),
00911                       [px3] "+&r" (px3),
00912                       [dst] "+&r" (dst)
00913                     : [gammaTable] "r" (lut),
00914                       [colorMatrix0] "w" (colorMatrix0), // Implicitly referenced only (d0)
00915                       [colorMatrix1] "w" (colorMatrix1), // Implicitly referenced only (d1)
00916                       [colorMatrix2] "w" (colorMatrix2), // Implicitly referenced only (d2)
00917                       [wCoord] "w" (wCoord),             // Implicitly referenced only (d20)
00918                       [maxValue] "w" (maxValue),         // Implicitly referenced only (d21)
00919                       [minValue] "w" (minValue)          // Implicitly referenced only (d22)
00920                     : "memory", 
00921                       "r0", "r1", "r2", "r3", "r4", "r5", "r6",
00922                       "d3", "d4", "d5", "d6",
00923                       "d7", "d8", "d9", "d10", 
00924                       "d11", "d12", "d13", "d14",
00925                       "d15", "d16", "d17", "d18", "d19"
00926                     );
00927 
00928             }            
00929         }
00930         
00931         //std::cout << "Done creating fast thumbnail. time = " << ((Time::now()-startTime)/1000) << std::endl;
00932 
00933         return thumb;
00934     }
00935 }
00936 
00937 
00938 #endif

Generated on Thu Jul 22 2010 17:50:34 for FCam by  doxygen 1.7.1