/*
Copyright (C) 2014 Jerome Revaud
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
*/
#include "conv.h"
#include "std.h"
#include "omp.h"
#include "maxfilter.h"
extern "C" {
#include
#define integer int
#define real float
extern int saxpy_(integer *n, real *sa, real *sx, integer *incx, real *sy, integer *incy);
extern int sscal_(integer *n, real *sa, real *sx, integer *incx);
}
static inline void fast_set_val( float * __restrict__ a, long d, const float val) {
if(val) {
int j;
for(j=0; j<d; j++)
a[j] = val;
} else
memset(a, 0, d*sizeof(float));
}
static inline void fast_add_val( float * __restrict__ a, long d, const float val) {
int j;
for(j=0; j<d; j++)
a[j] += val;
}
/* copy into <dest> a translated version of <src> scaled by <mul>;
areas falling outside of <src> are padded using <def>; <ex> extends the output size */
static inline float* fast_set_trans( float * dest, const float * src, const float mul,
int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
if(mul==0) {fast_set_val(dest,(tx+ex)*(ty+ex),0); return dest+(tx+ex)*(ty+ex);}
if(dx>tx) dx=tx; // after those values, nothing happens anyway
if(dy>ty) dy=ty;
if(-dx>tx) dx=-tx;
if(-dy>ty) dy=-ty;
#define add_default(n) {fast_set_val(dest,(n),mul*def); dest+=(n);}
float* _dest = dest;
// paste -v zeros rows
if(dy<0) add_default(-dy*(tx+ex));
src += MAX(0,dx);
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
int j;
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
// paste -u zeros cols
if(dx<0) add_default(-dx);
// copy one row of the translated image, scaled by mul
{int i; for(i=0; i<row_len; i++) dest[i] = mul*src[j*tx+i];}
dest += row_len;
// paste +u zeros cols
if(dx>=0) {add_default(dx)
if(ex) add_default(ex)}
}
// paste +v zeros rows
if(dy>=0){add_default(dy*(tx+ex))
if(ex) add_default(ex*(tx+ex))}
#undef add_default
assert( dest-_dest == (tx+ex)*(ty+ex) );
return dest;
}
static inline float* fast_add_trans( float * dest, const float * src, const float mul,
int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
if(mul==0) return dest+(tx+ex)*(ty+ex);
if(dx>tx) dx=tx; // after those values, nothing happens anyway
if(dy>ty) dy=ty;
if(-dx>tx) dx=-tx;
if(-dy>ty) dy=-ty;
#define add_default(n) {fast_add_val(dest,n,def*mul); dest+=n;}
float* _dest = dest;
// paste -v zeros rows
if(dy<0) add_default(-dy*(tx+ex));
src += MAX(0,dx);
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
int j;
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
// add -u default cols
if(dx<0) add_default(-dx);
// accumulate one row of the translated image, scaled by mul
{int i; for(i=0; i<row_len; i++) dest[i] += mul*src[j*tx+i];}
dest += row_len;
// add +u default cols
if(dx>=0) {add_default(dx)
if(ex) add_default(ex)}
}
// paste +v zeros rows
if(dy>=0){add_default(dy*(tx+ex))
if(ex) add_default(ex*(tx+ex))}
#undef add_default
assert( dest-_dest == (tx+ex)*(ty+ex) );
return dest;
}
static inline void norm_norm( float* norms, int nb, float mode ) {
int i;
if( mode < 0 )
assert(!"error: unknown norm mode");
else if( mode == 0.5 ) {
for(i=0; i<nb; i++)
norms[i] = sqrt(sqrt(norms[i])); // norms contains squared norms
} else if( mode <= 1 ) {
mode *= 0.5; // account for the squared norm
for(i=0; i<nb; i++)
norms[i] = pow(norms[i], mode);
} else if( mode > 1 )
assert(!"error: unknown norm mode");
}
/* normalize each pixel of a multi-layers image
norm = {0:nothing, 1:L2-normalization, 0-1: normalization by (L2-norm)**<norm>}
*/
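/* Illustrative sketch (hypothetical 3-layer pixel values, not a call made by this file):
with norm==1 the two loops below amount to a plain per-pixel L2 normalization,
  float n = v0*v0 + v1*v1 + v2*v2;            // first loop: squared L2 norm across layers
  n = sqrt(n);                                // norm_norm() with mode==1
  v0 /= n+1e-8; v1 /= n+1e-8; v2 /= n+1e-8;   // second loop
and with 0<norm<1 the divider becomes (L2-norm)**norm, i.e. pow(n, norm/2). */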
void norm_layers( float_layers* res, float norm, int n_thread ) {
if(norm==0) return;
const int layer_size = res->tx*res->ty;
const int n_layers = res->tz;
float* norms = NEWAC(float,layer_size);
long l;
for(l=0; l<n_layers; l++) {
float* r = res->pixels + l*layer_size;
int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(i=0; i<layer_size; i++)
norms[i] += r[i]*r[i];
}
norm_norm( norms, layer_size, norm );
for(l=0; l<n_layers; l++) {
float* r = res->pixels + l*layer_size;
int i;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(i=0; i<layer_size; i++)
r[i] /= norms[i]+1e-8;
}
free(norms);
}
static inline int get_patch_desc_dim( float_layers* hog, int size ) {
return size*size*hog->tz; // number of dimensions of an atomic patch descriptor
}
/* Sample a set of patches from a HOG image.
grid : array of (x,y) position of the patches
size: size of the patches, i.e. a patch at (x,y) covers [x,x+size[ x [y,y+size[
res: result array, n_patches x desc_dim
desc_dim = n_layers * size**2
norms: result, n_patches x 1, norm of each patch
*/
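/* Layout sketch (indices are illustrative, assuming no color image): the descriptor of
patch n is the size x size window of every hog layer, stored layer by layer and row by
row, so that
  res->pixels[ n*desc_dim + l*size*size + j*size + i ]  ==  hog layer l at pixel (x+i, y+j)
for (x,y) = grid[n], l in [0,n_layers[ and j,i in [0,size[. When a color image is given,
its n_colors values at the patch center are appended at the end of the descriptor. */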
void _sample_patches( float_layers* hog, float_layers* color, int_image* grid, int size, float norm,
float_image* res, float_array* norms, int n_thread ) {
const int tx = hog->tx;
const long npix = tx*hog->ty;
assert( grid->tx == 2 );
const int n_patches = grid->ty;
assert( res->ty == n_patches );
const int n_layers = hog->tz;
const int n_colors = (color? color->tz: 0);
const int color_npix = (color? color->tx*color->ty: 0);
const int desc_size = size*size*n_layers + (color? color->tz: 0);
assert(res->tx == desc_size );
int n;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float* r = res->pixels + desc_size*n;
int *p = grid->pixels + 2*n;
// copy hog
int x=p[0],y=p[1];
assert(0<=x && x+size<=tx);
assert(0<=y && y+size<=hog->ty);
int l,j;
for(l=0; l<n_layers; l++) {
float* h = hog->pixels + l*npix + y*tx + x;
for(j=0; j<size; j++) {
memcpy(r, h, size*sizeof(float));
r += size;
h += tx;
}
}
if(color) {
// append the color values at the patch center
float* c = color->pixels + (y+size/2)*color->ty + (x+size/2);
for(l=0; l<n_colors; l++)
*r++ = c[l*color_npix];
}
}
if( norm ) {
float* normp = norms ? norms->pixels : NEWAC(float, n_patches);
if(norms) {
assert(norms->tx==n_patches);
memset(normp,0,n_patches*sizeof(float));
}
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float* r = res->pixels + desc_size*n;
int l;
for(l=0; l<desc_size; l++)
normp[n] += r[l]*r[l];
}
norm_norm( normp, n_patches, norm );
// normalize each patch descriptor
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(n=0; n<n_patches; n++) {
float* r = res->pixels + desc_size*n;
int l;
float nn = normp[n]+1e-8;
for(l=0; l<desc_size; l++)
r[l] /= nn;
}
if(!norms) free(normp);
}
}
/* return the index in child_grid of the cell centered at (x,y), or -1 if it falls outside */
static int retrieve_children( const int x, const int y, const int_cube* child_grid ) {
const int size0_div2 = child_grid->pixels[0];
const int step0 = child_grid->tx==1 && child_grid->ty==1 ? 1 :
MAX( child_grid->pixels[2]-child_grid->pixels[0],
child_grid->pixels[1+2*child_grid->tx]-child_grid->pixels[1] );
int i = (x-size0_div2)/step0;
int j = (y-size0_div2)/step0;
assert( x==(i*step0+size0_div2) || !"error: child_grid does not match current grid" );
assert( y==(j*step0+size0_div2) || !"error: child_grid does not match current grid" );
if( i<0 || i>=child_grid->tx ) return -1;
if( j<0 || j>=child_grid->ty ) return -1;
return i+j*child_grid->tx;
}
/* Prepare a grid of cell positions in the first image for a given scale. Big cells inherit the cell at the previous scale.
size = size of cells at current scale
offset, step = grid generator: (offset + i*step, offset + j*step)
child_grid = grid of the previous layer (or None if first layer)
child_norms = image containing the norms of the patch at the previous level
grid = result center positions of cells in current scale
children = index of cells in previous scale used to construct big cells
norms = norms of the cells of this level
*/
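/* Example (hypothetical numbers): with size=16 and nc=2 children per row/column, the
cell centered at (x,y) looks up its 4 children at
  ( x + (2*u/(nc-1)-1)*size/4 , y + (2*v/(nc-1)-1)*size/4 ),   u,v in {0,1}
i.e. at (x-4,y-4), (x+4,y-4), (x-4,y+4) and (x+4,y+4); their indices in the child grid
are written into <children> and their norms are accumulated into <norms>. */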
void _prepare_big_cells( int size, int offset, int step,
int_cube* child_grid, float_image* child_norms,
int_cube* grid, int_cube* children, float_image* norms ) {
assert(grid->tz==2);
const int ntx = grid->tx; // should be == 1+(tx-size)/step so that patches do not pass the border
const int nty = grid->ty; // should be == 1+(ty-size)/step so that patches do not pass the border
/* grid[i,j] = ( offset + i*step, offset + j*step )
connection between two scales:
x cell position in lower scale == x position of children in upper scale
child_offset + child_i*child_step = offset + i*step + (2*u/(nc-1)-1)*size/4
*/
int i,j,u,v;
int* r = grid->pixels;
if( !child_grid ) {
// this is the first scale:
// we just return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
for(j=0; j<nty; j++)
for(i=0; i<ntx; i++) {
*r++ = offset + i*step;
*r++ = offset + j*step;
}
} else {
assert(child_grid->tz==2);
ASSERT_SAME_SIZE( child_grid, child_norms );
assert( children );
const int nc = sqrt(children->tz); // number of children per row or col
assert( children->tz==pow2(nc) );
ASSERT_SAME_SIZE( grid, children );
ASSERT_SAME_SIZE( grid, norms );
// this is at least second scale
// we return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
const int quarter = size/4;
assert(4*quarter==size);
int* c = children->pixels;
float *n = norms->pixels;
memset(n,0,ntx*nty*sizeof(float));
for(j=0; j<nty; j++)
for(i=0; i<ntx; i++) {
const int x = offset + i*step;
const int y = offset + j*step;
*r++ = x;
*r++ = y;
// accumulate the norms of the child cells
for(v=0; v<nc; v++)
for(u=0; u<nc; u++,c++) {
*c = retrieve_children( x+(2*u/(nc-1)-1)*quarter, y+(2*v/(nc-1)-1)*quarter, child_grid );
if(*c>=0) *n += child_norms->pixels[*c];
}
n++;
}
}
}
/* Prepare image for dotprod : dot(patches, res)
where patches is n_patches x patch_dim
set outside of the image to be equal to (0,...,ninth_val)
*/
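/* Size sketch (example values): each layer of <img> is replicated patch_size*patch_size
times, once per in-patch translation (u,v), so with 9 hog orientations and patch_size=4
the output <res> has 9*4*4 = 144 layers. The correlation of a whole patch at a given
pixel then reduces to a single dot product between its 144-dimensional descriptor and
the 144 values stacked at that pixel. */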
void _prepare_dotprod_convolution( float_layers* img, int patch_size, float ninth_val, int extend,
float_layers* res, int n_thread ) {
assert( img->tx+extend == res->tx );
assert( img->ty+extend == res->ty );
const int n_layers = img->tz;
const int tx = img->tx;
const int ty = img->ty;
const int npix = tx*ty;
const int npixex = (tx+extend)*(ty+extend);
assert( res->tz==patch_size*patch_size*img->tz );
long l;
const int first_half = patch_size/2; // half-size
const int second_half = patch_size - first_half;
const int layer_size = patch_size*patch_size*npixex;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* img_pix = img->pixels + l*npix;
float* r = res->pixels + l*layer_size;
int u,v;
// copy translated version of the image into res
for(v=-first_half; v<second_half; v++)
for(u=-first_half; u<second_half; u++)
r = fast_set_trans( r, img_pix, 1, u, v, tx, ty, extend, (l==n_layers-1) ? ninth_val : 0 );
}
}
/* build the stack of translated hog layers used for patch correlation, then optionally normalize it */
float_layers* prepare_dotprod_convolution( float_layers* hog, int patch_size, int extend, float norm, int nt ) {
const int nh = get_patch_desc_dim(hog,patch_size);
const int etx = hog->tx+extend; // extend a bit the image
const int ety = hog->ty+extend;
float_layers* res = NEW(float_layers);
*res = empty_layers(float,etx,ety,nh);
float ninth_val = 0;
_prepare_dotprod_convolution( hog, patch_size, ninth_val, extend, res, nt );
if( norm ) norm_layers( res, norm, nt );
return res;
}
inline float sum_array_f(const float* a, int n) {
int i=n;
double res = 0;
while(i--) res+=a[i];
return (float)res;
}
extern "C" {
int sgemm_(char *transa, char *transb, integer *m, integer *n, integer *k,
float *alpha, float *a, integer *lda, float *b, integer *ldb,
float *beta, float *c, integer *ldc);
}
/* matrix-matrix multiplication with several SGEMM (each is single-threaded)
res      =  dot( patches ,  convolved_hog )
(P x npix)      (P x nh)    (nh x npix)
*/
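/* Dimension sketch (hypothetical sizes): with nh=144, npix=10000 and a chunk of np
patches, the column-major call below computes
  C(npix x np) = A(npix x nh) * B(nh x np)
where A is convolved_hog (row-major nh x npix seen as column-major npix x nh, lda=npix),
B the chunk of patch descriptors (ldb=nh) and C the corresponding slice of res (ldc=npix). */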
void _dotprod( float_image* patches, float_layers* convolved_hog, float_layers* res, int n_thread ) {
int nh = patches->tx;
assert( nh == convolved_hog->tz );
ASSERT_SAME_IMG_SIZE( convolved_hog, res );
int P = patches->ty;
assert( res->tz == P );
int threadP = 1 + (P-1) / n_thread; // how many patches per thread
int npix = (int)IMG_SIZE(convolved_hog);
int l;
#if (defined(USE_OPENMP) && !defined(MULTITHREADED_BLAS))
#pragma omp parallel for num_threads(n_thread)
#else
n_thread = 1; // BLAS is already multithreaded
threadP = P;
#endif
for(l=0; l<n_thread; l++) {
int start = l*threadP;
int np = MIN(threadP, P-start);
if(np <= 0) continue; // nothing left for this thread
float* p = patches->pixels + nh*start;
float* r = res->pixels + npix*start;
// blas fast matrix-matrix product
char T='n'; float alpha = 1, beta = 0;
sgemm_( &T, &T, &npix, &np, &nh, &alpha,
convolved_hog->pixels, &npix,
p, &nh, &beta, r, &npix);
}
}
inline void transpose_scalar_block(const float *A, float *B, const int lda, const int ldb,
const int block_row, const int block_col) {
for(int i=0; i<block_row; i++)
for(int j=0; j<block_col; j++)
B[j*ldb + i] = A[i*lda + j];
}
void transpose_matrix( float_image* A, float_image* B, int nt ) {
const int n = A->ty, m = A->tx;
assert( n==B->tx && m==B->ty );
const int block_size = 16;
const float* pA = A->pixels;
float* pB = B->pixels;
#ifdef USE_OPENMP
#pragma omp parallel for num_threads(nt)
#endif
for(int i=0; i<n; i+=block_size)
for(int j=0; j<m; j+=block_size)
transpose_scalar_block( &pA[i*m+j], &pB[j*n+i], m, n, MIN(block_size,n-i), MIN(block_size,m-j) );
}
extern "C" {
int sgemv_(char *trans, integer *m, integer *n, float *alpha, float *a, integer *lda,
float *x, integer *incx, float *beta, float *y, integer *incy);
}
/* matrix-vector products when ngh_rad>0: each response map is restricted to a
2*ngh_rad x 2*ngh_rad neighborhood around the patch position given by <grid>;
the crop origin of each patch is returned in <offsets> */
void _dotprod_ngh_rad_T( int_cube* grid, float_image* patches, int ngh_rad,
float_cube* convolved_hog, float_layers* res_out, int_image* offsets, int n_thread ) {
const int nh = patches->tx;
assert( nh == convolved_hog->tz );
const int P = patches->ty;
assert( IMG_SIZE(grid)==P && grid->tz==2 );
const int tx = convolved_hog->tx;
const int ty = convolved_hog->ty;
// neighborhood size
int res_tx = MIN(tx,2*ngh_rad);
int res_ty = MIN(ty,2*ngh_rad);
assert(res_tx<=tx && res_ty<=ty);
const long res_npix = res_tx*res_ty;
*res_out = empty_layers(float, res_tx, res_ty, P);
assert(res_out->pixels || !"error: ran out of memory before sgemm");
*offsets = empty_image(int, 2, P);
char T='t'; float alpha=1, beta=0; int one=1;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(int j=0; j<res_ty; j++) {
for(int l=0; l<P; l++) {
float* p = patches->pixels + l*nh;
float* r = res_out->pixels + l*res_npix;
int left = MAX(0, MIN(grid->pixels[2*l+0] - ngh_rad, tx-2*ngh_rad));
int top = MAX(0, MIN(grid->pixels[2*l+1] - ngh_rad, ty-2*ngh_rad));
if(j==0) {
offsets->pixels[2*l+0] = left;
offsets->pixels[2*l+1] = top;
}
float* c = convolved_hog->pixels + (left + top*tx)*nh;
// blas fast matrix-vector product
sgemv_( &T, &nh, &res_tx, &alpha, c + j*tx*nh, &nh,
p, &one, &beta, r + j*res_tx, &one);
}
}
}
/* correct the convolution on the boundaries of the image
ttx, tty: true shape of the res_map (in case of using offsets)
*/
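/* Coordinate sketch (illustrative): when <offsets> is given (ngh_rad>0), the response
map of patch l only covers a crop of the full image starting at
(offsets[2*l+0], offsets[2*l+1]), so a true image column i maps to column i-offi of
that map (and row j to j-offj); ttx and tty are the dimensions of the full,
un-cropped maps. */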
void rectify_conv( int patch_size, int nori, float_image* patches, int_image* offsets,
const int ttx, const int tty, int extend, float_layers* res, int n_thread ) {
const int n_patches = patches->ty;
assert( n_patches == res->tz );
//const int nori = patches->tx/pow2(patch_size);
assert( patches->tx >= nori*pow2(patch_size) );
const int tx = res->tx; // real true shape because it has been extended
const int ty = res->ty;
const int first_half = patch_size/2;
const int second_half = patch_size - first_half; // in case patch_size is odd
assert( offsets || (ttx==tx && tty==ty) );
assert( !offsets || (ttx>=tx && tty>=ty) );
assert( !offsets || (offsets->ty==res->tz && offsets->tx==2) );
const long npix = IMG_SIZE(res);
int l;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_patches; l++) {
// load the crop offsets of this patch (0 if the full maps are used)
const int offi = offsets ? offsets->pixels[2*l+0] : 0;
const int offj = offsets ? offsets->pixels[2*l+1] : 0;
float sums[8]; // temporary norm of columns or rows
assert( patch_size <= (int)(sizeof(sums)/sizeof(sums[0])) );
int o,i,j;
// horizontal boundaries
memset(sums,0,sizeof(sums));
float* p = patches->pixels + l*patches->tx;
for(o=0; opixels + l*npix + (j-offj)*tx;
for(i=0; ipixels + l*npix + (j-offj)*tx;
for(i=0; ipixels + l*patches->tx;
for(o=0; opixels + l*npix + (i-offi);
for(j=0; jpixels + l*npix + (i-offi);
for(j=0; jpixels + l*npix;
}
}
/* Compute the correlation of all patches with the HOG image.
if ngh_rad>0, the correlation is only computed in a small local neighborhood
(whose size is parameterized by ngh_rad).
if extend: width and height of output maps are extended
if norm: correlations are normalized afterwards.
*/
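/* Usage sketch (hypothetical variables; the surrounding matching code is assumed to
have filled the res_scale struct):
  res_scale scale;   // grid, f, offsets... prepared by the caller
  fastconv( &patches, &hog2, 4, 0, 1, 1.0f, n_threads, &scale );
  // scale.res_map now holds one correlation map per row of patches
patch_size=4, ngh_rad=0, extend=1 and norm=1 are only example values. */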
void fastconv( float_image* patches, float_layers* hog, int patch_size, int ngh_rad,
int extend, float norm, int nt, res_scale* res ) {
assert(0<=extend and extend<=1);
float_layers* convolved_hog = prepare_dotprod_convolution( hog, patch_size, extend, norm, nt );
assert( patches->tx==convolved_hog->tz);
res->true_shape[0] = convolved_hog->tx;
res->true_shape[1] = convolved_hog->ty;
//hash_layers(convolved_hog)
int_image* offsets = NULL;
if( ngh_rad == 0 ) { // no limit on translation
// allocate result
res->res_map = empty_layers(float, convolved_hog->tx, convolved_hog->ty, patches->ty);
assert(res->res_map.pixels || !"error: ran out of memory before sgemm");
// multi-threaded fast matrix product
_dotprod( patches, convolved_hog, &res->res_map, nt );
} else { // ngh_rad>0: cropping res_map
offsets = &res->offsets;
// transpose hog: _dotprod is much faster this way
float_cube convolved_hog_T = empty_cube(float, convolved_hog->tx, convolved_hog->ty, convolved_hog->tz);
{ float_image A = reshape_xy_z(float, convolved_hog); // cast to 2D matrix without copy
float_image B = reshape_z_xy(float, &convolved_hog_T);
transpose_matrix( &A, &B, nt);
}
//hash_cube(&convolved_hog_T)
// resized grid
int_cube fgrid = cube_like(int, &res->grid);
for(int i=0; i<fgrid.tx*fgrid.ty*fgrid.tz; i++)
fgrid.pixels[i] = res->grid.pixels[i]/res->f;
//hash_cube(&fgrid)
// multi-threaded fast matrix product
_dotprod_ngh_rad_T( &fgrid, patches, ngh_rad, &convolved_hog_T, &res->res_map, offsets, nt );
free(fgrid.pixels);
free(convolved_hog_T.pixels);
//hash_image(offsets)
}
free_layers(convolved_hog);
// correct border effects on the correlation maps
rectify_conv( patch_size, hog->tz, patches, offsets, res->true_shape[0], res->true_shape[1],
extend, &res->res_map, nt );
}
/* Compute: arr **= p
*/
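/* Lookup-table sketch (illustrative numbers): with npc=64 and p=1.4, a value
a[i]=0.30 gives v=19.2, so the result is linearly interpolated between
precom[19]=pow(19/64.,1.4) and precom[20]=pow(20/64.,1.4) with weight w=0.2,
avoiding one pow() call per pixel. */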
void fastipow( float_layers* arr, const float p, int n_thread ) {
const int n_layers = arr->tz;
const long npix = arr->tx*arr->ty;
int l;
// optimization: precompute some values of pow(x,p)
const int npc = 64;
float precom[npc+1];
for(l=0; l<=npc; l++) precom[l]= pow(l/(float)npc,p);
const float maxindex = npc - 0.001;
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* a = arr->pixels + l*npix;
int i;
for(i=0; i<npix; i++) {
float v = npc*a[i];
assert( v>=0 && v<=npc );
if(v>maxindex) v=maxindex;
int n = int(v);
float w = v-n;
a[i] = (1-w)*precom[n] + w*precom[n+1];
}
}
}
/* Compute: arr = max(0,(arr-p)/(1-p))
*/
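/* Worked example (illustrative): with p=0.25, an input of 0.25 maps to 0, 0.625 maps
to 0.5 and 1.0 stays 1.0; every value below the threshold p is clipped to 0. */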
void fasthinge( float_layers* arr, const float p, int n_thread ) {
const int n_layers = arr->tz;
const long npix = arr->tx*arr->ty;
int l;
const float f = 1/(1-p);
#if defined(USE_OPENMP)
#pragma omp parallel for num_threads(n_thread)
#endif
for(l=0; l<n_layers; l++) {
float* a = arr->pixels + l*npix;
int i;
for(i=0; i<npix; i++) {
const float v = f*(a[i]-p);
a[i] = v<0 ? 0 : v;
}
}
}
/* return the maximum value of the array a of length n */
static inline float max_array_f(const float* a, int n) {
float res = a[0];
int i;
for(i=0; i<n; i++)
if(a[i]>res) res=a[i];
return res;
}
/* Normalize weights in border areas of width <gap>.
There are 9 areas: top-left, top-middle, top-right, ..., bottom-right.
sum_divf indicates the current weight in those areas, i.e. values in the area
should be divided by the weight. But trans_inv allows controlling the amount of
normalization: 0 = no normalization, 1 = full normalization.
*/
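/* Example (illustrative numbers): if an area accumulated a weight sum_divf[i]=4, then
trans_inv=1 rescales its values by 1/4 (full normalization), trans_inv=0.5 by
1/sqrt(4)=0.5 (partial normalization), and trans_inv=0 leaves them untouched. */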
static inline void normalize_trans(const int tx, const int ty, const int gap, float* rmap,
const float trans_inv, float sum_divf[9] ) {
if( trans_inv == 0 ) return;
int i,j;
for(i=0; i<9; i++) {
if( sum_divf[i]>0 )
sum_divf[i] = 1/pow(sum_divf[i], trans_inv); // pow() is the identity when trans_inv==1 (full normalization)
}
for(j=0; j<ty; j++)