/* Copyright (C) 2014 Jerome Revaud

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
*/
#include "conv.h"
#include "std.h"
#include "omp.h"
#include "maxfilter.h"

extern "C" {
#include
#define integer int
#define real float
extern int saxpy_(integer *n, real *sa, real *sx, integer *incx, real *sy, integer *incy);
extern int sscal_(integer *n, real *sa, real *sx, integer *incx);
}

/* set a[0..d-1] = val */
static inline void fast_set_val( float * __restrict__ a, long d, const float val ) {
  if(val) {
    int j;
    for(j=0; j<d; j++)
      a[j] = val;
  } else
    memset(a,0,d*sizeof(float));
}

/* add val to a[0..d-1] */
static inline void fast_add_val( float * __restrict__ a, long d, const float val ) {
  if(val) {
    int j;
    for(j=0; j<d; j++)
      a[j] += val;
  }
}

/* set dest[0..d-1] = mul * src[0..d-1] */
static inline void fast_set_vec( float * __restrict__ dest,
                                 const float * __restrict__ src, int d, const float mul ) {
  if(mul==1)
    memcpy(dest,src,d*sizeof(float));
  else {
    int j;
    for(j=0; j<d; j++)
      dest[j] = mul*src[j];
  }
}

/* dest[0..d-1] += mul * add[0..d-1] */
static inline void fast_add_vec( float * __restrict__ dest,
                                 const float * __restrict__ add, int d, float mul ) {
  if(mul) {
    int inc = 1;
    saxpy_( &d, &mul, (float*)add, &inc, dest, &inc );
  }
}

/* divide a[0..d-1] by div */
static inline void fast_div( float* a, long d, const float div ) {
  const float divi = 1/div;
  //  __m128 divi4 = _mm_set1_ps(divi);
  //  __m128* a4 = (__m128*)a;
  //  int e = d>>2;
  //  while(e--)  *a4++ *= (*divi4);
  int j;
  for(j=0; j<d; j++)
    a[j] *= divi;
}

/* paste into <dest> (of size (tx+ex)*(ty+ex)) a shifted copy of <src> (of size tx*ty),
   multiplied by <mul>: dest(x,y) = mul*src(x+dx,y+dy); uncovered areas are set to mul*def */
static inline float* fast_set_trans( float * dest, const float * src, const float mul,
                                     int dx, int dy, const int tx, const int ty,
                                     const int ex, const float def ) {
  if(dx>tx) dx=tx; // after those values, nothing happens anyway
  if(dy>ty) dy=ty;
  if(-dx>tx) dx=-tx;
  if(-dy>ty) dy=-ty;
  #define add_default(n) {fast_set_val(dest,(n),mul*def); dest+=(n);}
  float* _dest = dest;

  // paste -v zeros rows
  if(dy<0) add_default(-dy*(tx+ex));

  src += MAX(0,dx);
  const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
  int j;
  for(j=MAX(0,dy); j<MIN(ty,ty+dy); j++) {
    // paste -h zeros, then the row itself, then +h zeros
    if(dx<0) add_default(-dx);
    fast_set_vec( dest, src + j*tx, row_len, mul );
    dest += row_len;
    if(dx>=0) {add_default(dx) if(ex) add_default(ex)}
  }

  // paste +v zeros rows
  if(dy>=0){add_default(dy*(tx+ex)) if(ex) add_default(ex*(tx+ex))}
  #undef add_default
  assert( dest-_dest == (tx+ex)*(ty+ex) );
  return dest;
}

/* same as fast_set_trans, but accumulates into <dest> instead of overwriting it */
static inline float* fast_add_trans( float * dest, const float * src, const float mul,
                                     int dx, int dy, const int tx, const int ty,
                                     const int ex, const float def ) {
  if(mul==0) return dest+(tx+ex)*(ty+ex);
  if(dx>tx) dx=tx; // after those values, nothing happens anyway
  if(dy>ty) dy=ty;
  if(-dx>tx) dx=-tx;
  if(-dy>ty) dy=-ty;
  #define add_default(n) {fast_add_val(dest,n,def*mul); dest+=n;}
  float* _dest = dest;

  // paste -v zeros rows
  if(dy<0) add_default(-dy*(tx+ex));

  src += MAX(0,dx);
  const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
  int j;
  for(j=MAX(0,dy); j<MIN(ty,ty+dy); j++) {
    if(dx<0) add_default(-dx);
    fast_add_vec( dest, src + j*tx, row_len, mul );
    dest += row_len;
    if(dx>=0) {add_default(dx) if(ex) add_default(ex)}
  }

  // paste +v zeros rows
  if(dy>=0){add_default(dy*(tx+ex)) if(ex) add_default(ex*(tx+ex))}
  #undef add_default
  assert( dest-_dest == (tx+ex)*(ty+ex) );
  return dest;
}

/* transform an array of squared L2 norms according to <mode>:
   mode==1: L2 norm, 0<mode<1: (L2 norm)**mode */
static inline void norm_norm( float* norms, int nb, float mode ) {
  int i;
  if( mode < 0 )
    assert(!"error: unknown norm mode");
  else if( mode == 0 ) {
    // nothing to do
  } else if( mode == 0.5 ) {
    for(i=0; i<nb; i++)
      norms[i] = sqrt(sqrt(norms[i]));
  } else if( mode < 1 ) {
    for(i=0; i<nb; i++)
      norms[i] = pow(norms[i], mode/2);
  } else if( mode == 1 ) {
    for(i=0; i<nb; i++)
      norms[i] = sqrt(norms[i]);
  } else if( mode > 1 )
    assert(!"error: unknown norm mode");
}

/* normalize each pixel of a multi-layers image
   norm = {0: nothing, 1: L2-normalization, 0<norm<1: normalization by (L2-norm)**norm}
*/
void norm_layers( float_layers* res, float norm, int n_thread ) {
  if(norm==0) return;
  const int layer_size = res->tx*res->ty;
  const int n_layers = res->tz;
  float* norms = NEWAC(float,layer_size);
  long l;
  for(l=0; l<n_layers; l++) {
    float* r = res->pixels + l*layer_size;
    int i;
    #if defined(USE_OPENMP)
    #pragma omp parallel for num_threads(n_thread)
    #endif
    for(i=0; i<layer_size; i++)
      norms[i] += r[i]*r[i];
  }
  norm_norm( norms, layer_size, norm );

  for(l=0; l<n_layers; l++) {
    float* r = res->pixels + l*layer_size;
    int i;
    #if defined(USE_OPENMP)
    #pragma omp parallel for num_threads(n_thread)
    #endif
    for(i=0; i<layer_size; i++)
      r[i] /= norms[i]+1e-8;
  }

  free(norms);
}

int get_patch_desc_dim( float_layers* hog, int patch_size ) {
  return patch_size*patch_size * hog->tz; // number of dimensions of an atomic patch descriptor
}
/* Sample a set of patches from a HOG image.
   grid : array of the (x,y) positions of the patches
   size : size of the patches, i.e. [x,x+size[ x [y,y+size[
   res  : result array, n_patches x desc_dim
          desc_dim = n_layers * size**2
   norms: result, n_patches x 1, norm of each patch
*/
void _sample_patches( float_layers* hog, float_layers* color, int_image* grid, int size, float norm,
                      float_image* res, float_array* norms, int n_thread ) {
  const int tx = hog->tx;
  const long npix = tx*hog->ty;
  assert( grid->tx == 2 );
  const int n_patches = grid->ty;
  assert( res->ty == n_patches );
  const int n_layers = hog->tz;
  const int n_colors = (color? color->tz: 0);
  const int color_npix = (color? color->tx*color->ty: 0);
  const int desc_size = size*size*n_layers + (color? color->tz: 0);
  assert( res->tx == desc_size );
  int n;
  #if defined(USE_OPENMP)
  #pragma omp parallel for num_threads(n_thread)
  #endif
  for(n=0; n<n_patches; n++) {
    float* r = res->pixels + desc_size*n;
    int *p = grid->pixels + 2*n;

    // copy hog
    int x=p[0],y=p[1];
    assert(0<=x && x+size<=tx);
    assert(0<=y && y+size<=hog->ty);
    int l,j;
    for(l=0; l<n_layers; l++) {
      float* h = hog->pixels + l*npix + y*tx + x;
      for(j=0; j<size; j++) {
        memcpy( r, h, size*sizeof(float) );
        r += size;
        h += tx;
      }
    }

    // copy color
    if(color) {
      float* c = color->pixels + (y+size/2)*color->ty + (x+size/2);
      for(l=0; l<n_colors; l++)
        r[l] = c[l*color_npix];
    }
  }

  if( norm || norms ) {
    // compute the norm of each patch
    float* normp = norms ? norms->pixels : NEWAC(float, n_patches);
    if(norms) {
      assert(norms->tx==n_patches);
      memset(normp,0,n_patches*sizeof(float));
    }
    #if defined(USE_OPENMP)
    #pragma omp parallel for num_threads(n_thread)
    #endif
    for(n=0; n<n_patches; n++) {
      float* r = res->pixels + desc_size*n;
      int l;
      for(l=0; l<desc_size; l++)
        normp[n] += r[l]*r[l];
    }
    norm_norm( normp, n_patches, norm );

    if( norm ) {
      // normalize the descriptors
      #if defined(USE_OPENMP)
      #pragma omp parallel for num_threads(n_thread)
      #endif
      for(n=0; n<n_patches; n++) {
        float* r = res->pixels + desc_size*n;
        int l;
        float nn = normp[n]+1e-8;
        for(l=0; l<desc_size; l++)
          r[l] /= nn;
      }
    }

    if(!norms) free(normp);
  }
}

/* Return the index in <child_grid> of the cell centered at (x,y), or -1 if there is none */
static inline int retrieve_children( const int x, const int y, const int_cube* child_grid ) {
  const int size0_div2 = child_grid->pixels[0];
  const int step0 = child_grid->tx==1 && child_grid->ty==1 ? 1 :
                    MAX( child_grid->pixels[2]-child_grid->pixels[0],
                         child_grid->pixels[1+2*child_grid->tx]-child_grid->pixels[1] );
  int i = (x-size0_div2)/step0;
  int j = (y-size0_div2)/step0;
  assert( x==(i*step0+size0_div2) || !"error: child_grid does not match current grid" );
  assert( y==(j*step0+size0_div2) || !"error: child_grid does not match current grid" );
  if( i<0 || i>=child_grid->tx ) return -1;
  if( j<0 || j>=child_grid->ty ) return -1;
  return i+j*child_grid->tx;
}
/* Prepare a grid of cell positions in the first image for a given scale.
   Big cells inherit the cells of the previous scale.
   size         = size of the cells at the current scale
   offset, step = grid generator: (offset + i*step, offset + j*step)
   child_grid   = grid of the previous layer (or None if first layer)
   child_norms  = image containing the norms of the patches at the previous level
   grid         = result center positions of the cells of the current scale
   children     = index of the cells of the previous scale used to construct the big cells
   norms        = norms of the cells of this level
*/
void _prepare_big_cells( int size, int offset, int step,
                         int_cube* child_grid, float_image* child_norms,
                         int_cube* grid, int_cube* children, float_image* norms ) {
  assert(grid->tz==2);
  const int ntx = grid->tx; // should be == 1+(tx-size)/step so that patches do not pass the border
  const int nty = grid->ty; // should be == 1+(ty-size)/step so that patches do not pass the border

  /* grid[i,j] = ( offset + i*step, offset + j*step )

     connection between two scales:
     x cell position in lower scale == x position of children in upper scale
     child_offset + child_i*child_step = offset + i*step + (2*u/(nc-1)-1)*size/4
  */
  int i,j,u,v;
  int* r = grid->pixels;

  if( !child_grid ) {
    // this is the first scale:
    // we just return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
    for(j=0; j<nty; j++)
      for(i=0; i<ntx; i++) {
        *r++ = offset + i*step;
        *r++ = offset + j*step;
      }
  } else {
    assert(child_grid->tz==2);
    ASSERT_SAME_SIZE( child_grid, child_norms );
    assert( children );
    const int nc = sqrt(children->tz); // number of children per row or col
    assert( children->tz==pow2(nc) );
    ASSERT_SAME_SIZE( grid, children );
    ASSERT_SAME_SIZE( grid, norms );

    // this is at least the second scale:
    // we return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
    const int quarter = size/4;
    assert(4*quarter==size);
    int* c = children->pixels;
    float *n = norms->pixels;
    memset(n,0,ntx*nty*sizeof(float));
    for(j=0; j<nty; j++)
      for(i=0; i<ntx; i++) {
        const int x = offset + i*step;
        const int y = offset + j*step;
        *r++ = x;
        *r++ = y;
        // retrieve the children cells and accumulate their norms
        for(v=0; v<nc; v++)
          for(u=0; u<nc; u++,c++) {
            *c = retrieve_children( x+(2*u/(nc-1)-1)*quarter, y+(2*v/(nc-1)-1)*quarter, child_grid );
            if(*c>=0) *n += child_norms->pixels[*c];
          }
        n++;
      }
  }
}
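/* Illustrative sketch (not part of the original build): the child-placement formula quoted in
   the _prepare_big_cells comment above, (2*u/(nc-1)-1)*size/4 with integer division, enumerates
   the offsets of the children of a cell of side <size>: 4 children at (+/- size/4, +/- size/4)
   when nc==2, and 9 children including the center when nc==3. The function name
   print_child_offsets is only used in this sketch. */
#if 0
static void print_child_offsets( int size, int nc ) {
  const int quarter = size/4;
  for(int v=0; v<nc; v++)
    for(int u=0; u<nc; u++)
      printf("child(%d,%d) -> offset (%+d,%+d)\n",
             u, v, (2*u/(nc-1)-1)*quarter, (2*v/(nc-1)-1)*quarter);
  // nc==2: offsets are (-q,-q) (+q,-q) (-q,+q) (+q,+q); nc==3 also generates the middle row/column (0)
}
#endif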
/* Prepare image for dotprod : dot(patches, res)
   where patches is n_patches x patch_dim
   Pixels outside of the image are set to (0,...,ninth_val)
*/
void _prepare_dotprod_convolution( float_layers* img, int patch_size, float ninth_val, int extend,
                                   float_layers* res, int n_thread ) {
  assert( img->tx+extend == res->tx );
  assert( img->ty+extend == res->ty );
  const int n_layers = img->tz;
  const int tx = img->tx;
  const int ty = img->ty;
  const int npix = tx*ty;
  const int npixex = (tx+extend)*(ty+extend);
  assert( res->tz==patch_size*patch_size*img->tz );
  long l;
  const int first_half = patch_size/2; // half-size
  const int second_half = patch_size - first_half;
  const int layer_size = patch_size*patch_size*npixex;
  #if defined(USE_OPENMP)
  #pragma omp parallel for num_threads(n_thread)
  #endif
  for(l=0; l<n_layers; l++) {
    float* im = img->pixels + l*npix;
    float* r = res->pixels + l*layer_size;
    int u,v;
    // copy translated version of the image into res
    for(v=-first_half; v<second_half; v++)
      for(u=-first_half; u<second_half; u++)
        r = fast_set_trans( r, im, 1, u, v, tx, ty, extend, ninth_val );
  }
}

/* Build the convolved HOG image used for the patch dot-products, and normalize it if required */
float_layers* prepare_dotprod_convolution( float_layers* hog, int patch_size, int extend, float norm, int nt ) {
  const int nh = get_patch_desc_dim(hog,patch_size);
  const int etx = hog->tx+extend; // extend a bit the image
  const int ety = hog->ty+extend;

  float_layers* res = NEW(float_layers);
  *res = empty_layers(float,etx,ety,nh);
  float ninth_val = 0;
  _prepare_dotprod_convolution( hog, patch_size, ninth_val, extend, res, nt );

  if( norm ) norm_layers( res, norm, nt );
  return res;
}

inline float sum_array_f(const float* a, int n) {
  int i=n;
  double res = 0;
  while(i--) res+=a[i];
  return (float)res;
}

extern "C" {
int sgemm_(char *transa, char *transb, integer *m, integer *n, integer *k, float *alpha,
           float *a, integer *lda, float *b, integer *ldb, float *beta, float *c, integer *ldc);
}

/* matrix-matrix multiplication with several SGEMM (each is single-threaded)
      res     =  dot( patches, convolved_hog )
   P x npix        P x nh        nh x npix
*/
void _dotprod( float_image* patches, float_layers* convolved_hog, float_layers* res, int n_thread ) {
  int nh = patches->tx;
  assert( nh == convolved_hog->tz );
  ASSERT_SAME_IMG_SIZE( convolved_hog, res );
  int P = patches->ty;
  assert( res->tz == P );
  int threadP = 1 + (P-1) / n_thread; // how many patches per thread
  int npix = (int)IMG_SIZE(convolved_hog);
  int l;
  #if (defined(USE_OPENMP) && !defined(MULTITHREADED_BLAS))
  #pragma omp parallel for num_threads(n_thread)
  #else
  n_thread = 1; // BLAS is already multithreaded
  threadP = P;
  #endif
  for(l=0; l<n_thread; l++) {
    int start = l*threadP;
    int np = MIN(threadP, P-start); // number of patches processed by this thread
    if(np<=0) continue;
    float* p = patches->pixels + nh*start;
    float* r = res->pixels + npix*start;
    // blas fast matrix-matrix product
    char T='n';
    float alpha = 1, beta = 0;
    sgemm_( &T, &T, &npix, &np, &nh, &alpha, convolved_hog->pixels, &npix, p, &nh, &beta, r, &npix);
  }
}
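/* Illustrative sketch (not part of the original build): why the sgemm_ call in _dotprod above
   passes the operands in "swapped" order. We want the row-major product R = P * C with
   P (np x nh) and C (nh x npix). A row-major matrix read as column-major is its transpose, so
   computing R^T = C^T * P^T with column-major sgemm_ yields R directly in row-major layout.
   The function name check_dotprod and its arguments are only used in this sketch. */
#if 0
static void check_dotprod( const float* P, const float* C, float* R, int np, int nh, int npix ) {
  char T = 'n';
  float alpha = 1, beta = 0;
  // column-major view: R^T (npix x np) = C^T (npix x nh) * P^T (nh x np)
  sgemm_( &T, &T, &npix, &np, &nh, &alpha,
          (float*)C, &npix,   // C^T, leading dimension npix
          (float*)P, &nh,     // P^T, leading dimension nh
          &beta, R, &npix );  // R^T, leading dimension npix
  // reference: the naive row-major triple loop gives the same R
  for(int i=0; i<np; i++)
    for(int j=0; j<npix; j++) {
      double s = 0;
      for(int k=0; k<nh; k++) s += P[i*nh+k] * C[k*npix+j];
      assert( fabs(R[i*npix+j] - s) < 1e-3 );
    }
}
#endif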
inline void transpose_scalar_block(const float *A, float *B, const int lda, const int ldb,
                                   const int block_row, const int block_col) {
  for(int i=0; i<block_row; i++)
    for(int j=0; j<block_col; j++)
      B[j*ldb + i] = A[i*lda + j];
}

/* transpose matrix A into B, block by block */
void transpose_matrix( float_image* A, float_image* B, int nt ) {
  const int n = A->ty, m = A->tx;
  assert( n==B->tx && m==B->ty );
  const int block_size = 16;
  const float* pA = A->pixels;
  float* pB = B->pixels;
  #ifdef USE_OPENMP
  #pragma omp parallel for num_threads(nt)
  #endif
  for(int i=0; i<n; i+=block_size)
    for(int j=0; j<m; j+=block_size)
      transpose_scalar_block( &pA[i*m + j], &pB[j*n + i], m, n,
                              MIN(block_size, n-i), MIN(block_size, m-j) );
}

extern "C" {
int sgemv_(char *trans, integer *m, integer *n, float *alpha, float *a, integer *lda,
           float *x, integer *incx, float *beta, float *y, integer *incy);
}

/* Same as _dotprod, but the correlation of each patch is only computed in a local
   neighborhood of radius <ngh_rad> around its position in <grid>.
   The top-left corner of each cropped response map is stored in <offsets>.
*/
void _dotprod_ngh_rad_T( int_cube* grid, float_image* patches, int ngh_rad,
                         float_cube* convolved_hog, float_layers* res_out,
                         int_image* offsets, int n_thread ) {
  int nh = patches->tx;
  assert( nh == convolved_hog->tz );
  const int P = patches->ty;
  assert( IMG_SIZE(grid)==P && grid->tz==2 );
  const int tx = convolved_hog->tx;
  const int ty = convolved_hog->ty;

  // neighborhood size
  int res_tx = MIN(tx,2*ngh_rad);
  int res_ty = MIN(ty,2*ngh_rad);
  assert(res_tx<=tx && res_ty<=ty);
  const long res_npix = res_tx*res_ty;

  // allocate results
  *res_out = empty_layers(float, res_tx, res_ty, P);
  assert(res_out->pixels || !"error: ran out of memory before sgemm");
  *offsets = empty_image(int, 2, P);

  char T='t';
  float alpha=1, beta=0;
  int one=1;
  #if defined(USE_OPENMP)
  #pragma omp parallel for num_threads(n_thread)
  #endif
  for(int j=0; j<res_ty; j++) { // for each row of the cropped response maps
    for(int l=0; l<P; l++) {    // for each patch
      float* p = patches->pixels + l*nh;
      float* r = res_out->pixels + l*res_npix;
      int left = MAX(0, MIN(grid->pixels[2*l+0] - ngh_rad, tx-2*ngh_rad));
      int top  = MAX(0, MIN(grid->pixels[2*l+1] - ngh_rad, ty-2*ngh_rad));
      if(j==0) {
        offsets->pixels[2*l+0] = left;
        offsets->pixels[2*l+1] = top;
      }
      float* c = convolved_hog->pixels + (left + top*tx)*nh;

      // blas fast matrix-vector product
      sgemv_( &T, &nh, &res_tx, &alpha, c + j*tx*nh, &nh, p, &one, &beta, r + j*res_tx, &one);
    }
  }
}

/* correct the convolution on the boundaries of the image
   ttx, tty: true shape of the res_map (in case of using offsets)
*/
void rectify_conv( int patch_size, int nori, float_image* patches, int_image* offsets,
                   const int ttx, const int tty, int extend, float_layers* res, int n_thread ) {
  const int n_patches = patches->ty;
  assert( n_patches == res->tz );
  //const int nori = patches->tx/pow2(patch_size);
  assert( patches->tx >= nori*pow2(patch_size) );
  const int tx = res->tx; // real true shape because it has been extended
  const int ty = res->ty;
  const int first_half = patch_size/2;
  const int second_half = patch_size - first_half; // in case patch_size is odd
  assert( offsets || (ttx==tx && tty==ty) );
  assert( !offsets || (ttx>=tx && tty>=ty) );
  assert( !offsets || (offsets->ty==res->tz && offsets->tx==2) );
  const long npix = IMG_SIZE(res);
  int l;
  #if defined(USE_OPENMP)
  #pragma omp parallel for num_threads(n_thread)
  #endif
  for(l=0; l<n_patches; l++) {
    // load offsets, if any
    const int offi = offsets ? offsets->pixels[2*l+0] : 0;
    const int offj = offsets ? offsets->pixels[2*l+1] : 0;
    float sums[8]; // temporary norm of columns or rows
    assert( patch_size <= (int)(sizeof(sums)/sizeof(sums[0])) );
    int o,i,j;

    // horizontal boundaries: squared norm of each row of the patch
    memset(sums,0,sizeof(sums));
    float* p = patches->pixels + l*patches->tx;
    for(o=0; o<nori; o++)
      for(j=0; j<patch_size; j++)
        for(i=0; i<patch_size; i++,p++)
          sums[j] += (*p)*(*p);
    const float total = sum_array_f(sums,patch_size);

    // top rows: the first rows of the patch fall outside of the image
    for(j=offj; j<MIN(first_half,offj+ty); j++) {
      const float mul = sqrt( total / (sum_array_f(sums+first_half-j,second_half+j) + 1e-8) );
      float* r = res->pixels + l*npix + (j-offj)*tx;
      for(i=0; i<tx; i++) r[i] *= mul;
    }
    // bottom rows: the last rows of the patch fall outside of the image
    for(j=MAX(offj,tty-second_half+1); j<MIN(tty,offj+ty); j++) {
      const float mul = sqrt( total / (sum_array_f(sums,first_half+tty-j) + 1e-8) );
      float* r = res->pixels + l*npix + (j-offj)*tx;
      for(i=0; i<tx; i++) r[i] *= mul;
    }

    // vertical boundaries: squared norm of each column of the patch
    memset(sums,0,sizeof(sums));
    p = patches->pixels + l*patches->tx;
    for(o=0; o<nori; o++)
      for(j=0; j<patch_size; j++)
        for(i=0; i<patch_size; i++,p++)
          sums[i] += (*p)*(*p);

    // left columns
    for(i=offi; i<MIN(first_half,offi+tx); i++) {
      const float mul = sqrt( total / (sum_array_f(sums+first_half-i,second_half+i) + 1e-8) );
      float* r = res->pixels + l*npix + (i-offi);
      for(j=0; j<ty; j++) r[j*tx] *= mul;
    }
    // right columns
    for(i=MAX(offi,ttx-second_half+1); i<MIN(ttx,offi+tx); i++) {
      const float mul = sqrt( total / (sum_array_f(sums,first_half+ttx-i) + 1e-8) );
      float* r = res->pixels + l*npix + (i-offi);
      for(j=0; j<ty; j++) r[j*tx] *= mul;
    }
  }
}

/* Compute the correlation of all patches with the convolved HOG image.
   If ngh_rad>0, the correlation is only computed in a small local neighborhood
   (whose size is parameterized by ngh_rad).
   if extend: width and height of output maps are extended
   if norm: correlations are normalized afterwards.
*/
void fastconv( float_image* patches, float_layers* hog, int patch_size, int ngh_rad,
               int extend, float norm, int nt, res_scale* res ) {
  assert(0<=extend and extend<=1);
  float_layers* convolved_hog = prepare_dotprod_convolution( hog, patch_size, extend, norm, nt );
  assert( patches->tx==convolved_hog->tz);
  res->true_shape[0] = convolved_hog->tx;
  res->true_shape[1] = convolved_hog->ty;
  //hash_layers(convolved_hog)

  int_image* offsets = NULL;

  if( ngh_rad == 0 ) { // no limit on translation
    // allocate result
    res->res_map = empty_layers(float, convolved_hog->tx, convolved_hog->ty, patches->ty);
    assert(res->res_map.pixels || !"error: ran out of memory before sgemm");

    // multi-threaded fast matrix product
    _dotprod( patches, convolved_hog, &res->res_map, nt );

  } else { // ngh_rad>0: cropping res_map
    offsets = &res->offsets;

    // transpose hog: _dotprod is much faster this way
    float_cube convolved_hog_T = empty_cube(float, convolved_hog->tx, convolved_hog->ty, convolved_hog->tz);
    {
      float_image A = reshape_xy_z(float, convolved_hog); // cast to 2D matrix without copy
      float_image B = reshape_z_xy(float, &convolved_hog_T);
      transpose_matrix( &A, &B, nt);
    }
    //hash_cube(&convolved_hog_T)

    // resized grid
    int_cube fgrid = cube_like(int, &res->grid);
    const int ngrid = res->grid.tx*res->grid.ty*res->grid.tz;
    for(int i=0; i<ngrid; i++)
      fgrid.pixels[i] = res->grid.pixels[i]/res->f;
    //hash_cube(&fgrid)

    // multi-threaded fast matrix product
    _dotprod_ngh_rad_T( &fgrid, patches, ngh_rad, &convolved_hog_T, &res->res_map, offsets, nt );
    free(fgrid.pixels);
    free(convolved_hog_T.pixels);
    //hash_image(offsets)
  }
  free_layers(convolved_hog);

  // correct border effects on the correlation maps
  rectify_conv( patch_size, hog->tz, patches, offsets, res->true_shape[0], res->true_shape[1],
                extend, &res->res_map, nt );
}

/* Compute: arr **= p */
void fastipow( float_layers* arr, const float p, int n_thread ) {
  const int n_layers = arr->tz;
  const long npix = arr->tx*arr->ty;
  int l;
  // optimization: precompute some values of pow(x,p)
  const int npc = 64;
  float precom[npc+1];
  for(l=0; l<=npc; l++) precom[l] = pow(l/(float)npc,p);
  const float maxindex = npc - 0.001;

  #if defined(USE_OPENMP)
  #pragma omp parallel for num_threads(n_thread)
  #endif
  for(l=0; l<n_layers; l++) {
    float* a = arr->pixels + l*npix;
    int i;
    for(i=0; i<npix; i++) {
      float v = a[i]*npc; // position in the precomputed table
      assert(v>=0 && v<=npc);
      if(v>maxindex) v=maxindex;
      int n = int(v);
      float w = v-n;
      a[i] = (1-w)*precom[n] + w*precom[n+1]; // linear interpolation
    }
  }
}

/* Compute: arr = max(0,(arr-p)/(1-p)) */
void fasthinge( float_layers* arr, const float p, int n_thread ) {
  const int n_layers = arr->tz;
  const long npix = arr->tx*arr->ty;
  int l;
  const float f = 1/(1-p);

  #if defined(USE_OPENMP)
  #pragma omp parallel for num_threads(n_thread)
  #endif
  for(l=0; l<n_layers; l++) {
    float* a = arr->pixels + l*npix;
    int i;
    for(i=0; i<npix; i++) {
      const float v = a[i];
      a[i] = MAX(0, f*(v-p));
    }
  }
}

inline float min_array_f(const float* a, int n) {
  int i=n;
  float res = a[0];
  while(i--) if(a[i]<res) res=a[i];
  return res;
}

inline float max_array_f(const float* a, int n) {
  int i=n;
  float res = a[0];
  while(i--) if(a[i]>res) res=a[i];
  return res;
}
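/* Illustrative usage sketch (not part of the original build): fasthinge and fastipow above can be
   chained to post-process a correlation map; the scalar equivalent of the pair is
   max(0,(x-p)/(1-p)) ** q, with fastipow being a 64-bin interpolated approximation of the power.
   The function name hinge_pow_example, the 4x4 map and the constants 0.2/1.5 are only used in
   this sketch. */
#if 0
static void hinge_pow_example() {
  float_layers m = empty_layers(float, 4, 4, 1);   // one 4x4 correlation map
  for(int i=0; i<16; i++) m.pixels[i] = i/15.0f;   // fake responses in [0,1]
  fasthinge( &m, 0.2f, 1 );                        // max(0,(x-0.2)/0.8)
  fastipow ( &m, 1.5f, 1 );                        // x ** 1.5 (table-interpolated)
  for(int i=0; i<16; i++)
    printf("%d: %f (exact %f)\n", i, m.pixels[i],
           pow(MAX(0.0f,(i/15.0f-0.2f)/0.8f), 1.5f));
  free(m.pixels);
}
#endif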
/* Normalize the weights in the border areas of width <gap>.
   There are 9 areas: top-left, top-middle, top-right, ..., bottom-right.
   sum_divf indicates the current weight in those areas, i.e. values in an area should be
   divided by that weight. trans_inv controls the amount of normalization:
   0=no normalization, 1=normal
*/
static inline void normalize_trans(const int tx, const int ty, const int gap, float* rmap,
                                   const float trans_inv, float sum_divf[9] ) {
  if( trans_inv == 0 ) return;
  int i,j;
  for(i=0; i<9; i++) {
    if( sum_divf[i]>0 )
      sum_divf[i] = 1/pow(sum_divf[i], trans_inv); // if trans_inv==1, no effect
  }
  // multiply each of the 9 areas by its normalization factor
  for(j=0; j<ty; j++) {
    const int aj = (j<gap) ? 0 : (j<ty-gap) ? 1 : 2;
    for(i=0; i<tx; i++) {
      const int ai = (i<gap) ? 0 : (i<tx-gap) ? 1 : 2;
      *rmap++ *= sum_divf[3*aj + ai];
    }
  }
}

/* Sparse convolution: combine the children response maps <child_map>, as specified by <children>,
   and put the result in <res>.
   A standard order is assumed on the children:
     a response map #p is built from the children[p] at positions
        [(gap*dx,gap*dy) for dy in dys for dx in dxs]
     where dxs = [-1,1] or [-1,0,1]
           dys = [-1,1] or [-1,0,1]
   child_assign denotes the assignment of the children level, while assign is for the next level.
   child_norms contains the norms of the small patches, and norms those of the big new cells.
*/
int _sparse_conv( int_image* children, int_array* child_assign, int gap, float trans_inv,
                  float_layers* child_map, int_image* offsets, float_array* child_norms,
                  float_array* norms, int_array* assign, float_layers* res,
                  int_image* res_offsets, int n_thread ) {
  const int nconv = children->ty; // number of convolutions to perform
  const int nc2 = children->tx;
  const int nc = sqrt(nc2);
  assert( nc*nc == nc2 );
  assert( res->tz == nconv );
  const int tx = child_map->tx;
  const int ty = child_map->ty;
  const long npix = tx*ty;
  ASSERT_SAME_SIZE( child_map, res );
  const int n_lower_conv = max_array_i(children->pixels,nconv*nc2)+1;
  int* cass = child_assign ? child_assign->pixels : NEWA(int,n_lower_conv);
  if(!child_assign) {for(int i=0; i<n_lower_conv; i++) cass[i]=i;}
  assert( !offsets || (offsets->pixels && offsets->tx==2 && offsets->ty==n_lower_conv &&
                       res_offsets && res_offsets->tx==2 && res_offsets->ty==nconv) );

  if(assign) {
    assert(0); // not supposed to happen
  } else {
    // normal case: no redundancy to exploit in response maps
    int l;
    #if defined(USE_OPENMP)
    #pragma omp parallel for num_threads(n_thread)
    #endif
    for(l=0; l<nconv; l++) {
      float* rmap = res->pixels + l*npix;
      int u,v,c,ncall=0; // children number
      const int* const child = children->pixels + l*nc2;
      float sum_divf[9];
      memset(sum_divf,0,sizeof(sum_divf));
      int i,j;

      // first, choose an offset for the result rmap from the child offsets
      int offx=0, offy=0;
      if( offsets ) {
        int sum_ox=0, sum_oy=0, w=0;
        for(c=v=0; v<nc; v++) {
          const int dy = 2*v/(nc-1)-1;
          for(u=0; u<nc; u++,c++) {
            const int dx = 2*u/(nc-1)-1;
            if(child[c]<0) continue;
            sum_ox += offsets->pixels[2*child[c]+0] - dx*gap;
            sum_oy += offsets->pixels[2*child[c]+1] - dy*gap;
            w++;
          }
        }
        if(w==0) w++; // just in case
        offx = (int)floor(0.5 + sum_ox/float(w));
        offy = (int)floor(0.5 + sum_oy/float(w));
        // store result for later
        res_offsets->pixels[2*l+0] = offx;
        res_offsets->pixels[2*l+1] = offy;
      }

      for(c=v=0; v<nc; v++) {
        const int dy = 2*v/(nc-1)-1;
        for(u=0; u<nc; u++,c++) {
          const int dx = 2*u/(nc-1)-1;
          if(child[c]<0) continue;
          // weight of this child relative to the norm of the parent cell
          const float divf = child_norms->pixels[child[c]]/norms->pixels[l];
          // difference with rmap's offset
          const int trans_x = dx*gap + (offsets? offx - offsets->pixels[2*child[c]+0] : 0);
          const int trans_y = dy*gap + (offsets? offy - offsets->pixels[2*child[c]+1] : 0);

          // count the sum of weights in every image area
          for(i=-1; i<=1; i++)
            for(j=-1; j<=1; j++)
              if(i*trans_x<=0 && j*trans_y<=0)
                sum_divf[4+j*3+i] += divf;

          // add a translated version of map[children[c]] by (ox-dx,oy-dy)
          if(ncall++==0) // first call
            fast_set_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf,
                            trans_x,trans_y, tx,ty, 0, 0 );
          else
            fast_add_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf,
                            trans_x,trans_y, tx,ty, 0, 0 );
        }
      }

      if( ncall == 0) // default = zeros
        memset(rmap, 0, npix*sizeof(float));

      // now we are supposed to rectify the boundaries (to perfect convolution)
      normalize_trans(tx, ty, gap, rmap, trans_inv, sum_divf );
      //assert(min_array_f(rmap,npix)>=0 && max_array_f(rmap,npix)<=1.001);
    }
  }

  if(!child_assign) free(cass);

#define CHECK_MAPS(rmaps) assert(min_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))>=0 && \
                                 max_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))<=1.001)
  //CHECK_MAPS(res);

  return nconv;
}
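/* Illustrative sketch (not part of the original build): the 9-area indexing shared by
   normalize_trans and _sparse_conv above. An area index is 4+j*3+i with i,j in {-1,0,1},
   where i selects the left/middle/right bands of width <gap> and j the top/middle/bottom
   bands, so index 4 is the interior and index 0 the top-left corner. The function name
   area_name is only used in this sketch. */
#if 0
static const char* area_name( int i, int j ) { // i,j in {-1,0,1}
  static const char* names[9] = { "top-left",    "top-middle",    "top-right",
                                  "middle-left", "interior",      "middle-right",
                                  "bottom-left", "bottom-middle", "bottom-right" };
  return names[4 + j*3 + i];
}
#endif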