You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
989 lines
32 KiB
989 lines
32 KiB
/*
|
|
Copyright (C) 2014 Jerome Revaud
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU General Public License as published by
|
|
the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU General Public License for more details.
|
|
|
|
You should have received a copy of the GNU General Public License
|
|
along with this program. If not, see <http://www.gnu.org/licenses/>
|
|
*/
|
|
#include "conv.h"
|
|
#include "std.h"
|
|
#include "omp.h"
|
|
#include "maxfilter.h"
|
|
|
|
extern "C" {
|
|
#include <immintrin.h>
|
|
#define integer int
|
|
#define real float
|
|
extern int saxpy_(integer *n, real *sa, real *sx, integer *incx, real *sy, integer *incy);
|
|
extern int sscal_(integer *n, real *sa, real *sx, integer *incx);
|
|
}
|
|
|
|
|
|
static inline void fast_set_val( float * __restrict__ a, long d, const float val) {
|
|
if(val) {
|
|
int j;
|
|
for(j=0; j<d; j++)
|
|
a[j] = val;
|
|
} else
|
|
memset(a,0,d*sizeof(float));
|
|
}
|
|
static inline void fast_add_val( float * __restrict__ a, long d, const float val) {
|
|
int j;
|
|
for(j=0; j<d; j++)
|
|
a[j] += val;
|
|
}
|
|
static inline void fast_set_vec( float * __restrict__ dest,
|
|
const float * __restrict__ src, int d, const float mul) {
|
|
if( mul==1)
|
|
memcpy(dest,src,d*sizeof(float));
|
|
else {
|
|
int j;
|
|
for(j=0; j<d; j++)
|
|
dest[j] = mul*src[j];
|
|
}
|
|
}
|
|
static inline void fast_add_vec( float * __restrict__ dest,
|
|
const float * __restrict__ add, int d, float mul) {
|
|
if(d<=4) {
|
|
int j;
|
|
for(j=0; j<d; j++)
|
|
dest[j] += mul*add[j];
|
|
} else {
|
|
int inc = 1;
|
|
saxpy_( &d, &mul, (float*)add, &inc, (float*)dest, &inc );
|
|
}
|
|
}
|
|
static inline void fast_div( float * __restrict__ a, long d, const float div) {
|
|
const float divi = 1/div;
|
|
// assert( ((long)a & 15) == 0 && (d & 3) == 0 );
|
|
// const float _divi4[] = {divi,divi,divi,divi};
|
|
// __v4sf *a4 = (__v4sf*)a;
|
|
// __v4sf *divi4 = (__v4sf*)_divi4;
|
|
// int e = d>>2;
|
|
// while(e--) *a4++ *= (*divi4);
|
|
int j;
|
|
for(j=0; j<d; j++)
|
|
a[j] *= divi;
|
|
}
|
|
|
|
static inline float* fast_set_trans( float * dest, const float * src, const float mul,
|
|
int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
|
|
if(mul==0) {
|
|
memset(dest,0,sizeof(float)*(tx+ex)*(ty+ex));
|
|
return dest+(tx+ex)*(ty+ex);
|
|
}
|
|
if(dx>tx) dx=tx; // after those alues, nothing happens anyway
|
|
if(dy>ty) dy=ty;
|
|
if(-dx>tx) dx=-tx;
|
|
if(-dy>ty) dy=-ty;
|
|
|
|
#define add_default(n) {fast_set_val(dest,(n),mul*def); dest+=(n);}
|
|
float* _dest = dest;
|
|
|
|
// paste -v zeros rows
|
|
if(dy<0) add_default(-dy*(tx+ex));
|
|
|
|
src += MAX(0,dx);
|
|
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
|
|
int j;
|
|
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
|
|
|
|
// paste -u zeros cols
|
|
if(dx<0) add_default(-dx);
|
|
|
|
// past image
|
|
fast_set_vec(dest,src+j*tx,row_len,mul);
|
|
dest += row_len;
|
|
|
|
// paste +u zeros cols
|
|
if(dx>=0) {add_default(dx)
|
|
if(ex) add_default(ex)}
|
|
}
|
|
|
|
// paste +v zeros rows
|
|
if(dy>=0){add_default(dy*(tx+ex))
|
|
if(ex) add_default(ex*(tx+ex))}
|
|
|
|
#undef add_default
|
|
assert( dest-_dest == (tx+ex)*(ty+ex) );
|
|
return dest;
|
|
}
|
|
|
|
static inline float* fast_add_trans( float * dest, const float * src, const float mul,
|
|
int dx, int dy, const int tx, const int ty, const int ex, const float def ) {
|
|
if(mul==0) return dest+(tx+ex)*(ty+ex);
|
|
if(dx>tx) dx=tx; // after those alues, nothing happens anyway
|
|
if(dy>ty) dy=ty;
|
|
if(-dx>tx) dx=-tx;
|
|
if(-dy>ty) dy=-ty;
|
|
#define add_default(n) {fast_add_val(dest,n,def*mul); dest+=n;}
|
|
float* _dest = dest;
|
|
|
|
// paste -v zeros rows
|
|
if(dy<0) add_default(-dy*(tx+ex));
|
|
|
|
src += MAX(0,dx);
|
|
const int row_len = MIN(tx,tx+dx+ex) - MAX(0,dx);
|
|
int j;
|
|
for(j=MAX(0,dy); j<MIN(ty,ty+dy+ex); j++) {
|
|
|
|
// paste -u zeros cols
|
|
if(dx<0) add_default(-dx);
|
|
|
|
// past image
|
|
fast_add_vec(dest,src+j*tx,row_len,mul);
|
|
dest += row_len;
|
|
|
|
// paste +u zeros cols
|
|
if(dx>=0) {add_default(dx)
|
|
if(ex) add_default(ex)}
|
|
}
|
|
|
|
// paste +v zeros rows
|
|
if(dy>=0){add_default(dy*(tx+ex))
|
|
if(ex) add_default(ex*(tx+ex))}
|
|
|
|
#undef add_default
|
|
assert( dest-_dest == (tx+ex)*(ty+ex) );
|
|
return dest;
|
|
}
|
|
|
|
|
|
static inline void norm_norm( float* norms, int nb, float mode ) {
|
|
int i;
|
|
if( mode < 0 )
|
|
assert(!"error: unknown norm mode");
|
|
else if( mode == 0.5 ) {
|
|
for(i=0; i<nb; i++)
|
|
norms[i] = sqrt(sqrt(norms[i]));
|
|
} else if( mode < 1 ) {
|
|
mode *= 0.5; // cumulate with initial 1/sqrt(.)
|
|
for(i=0; i<nb; i++)
|
|
norms[i] = pow(norms[i], mode);
|
|
} else if( mode == 1 ) {
|
|
for(i=0; i<nb; i++)
|
|
norms[i] = sqrt(norms[i]);
|
|
} else if( mode > 1 )
|
|
assert(!"error: unknown norm mode");
|
|
}
|
|
|
|
|
|
/* normalize each pixel of a multi-layers image
|
|
norm = {0:nothing, 1:L2-normalization, 0-1: normalization by (L2-norm)**<norm> }
|
|
*/
|
|
void norm_layers( float_layers* res, float norm, int n_thread ) {
|
|
if(norm==0) return;
|
|
|
|
const int layer_size = res->tx*res->ty;
|
|
const int n_layers = res->tz;
|
|
float* norms = NEWAC(float,layer_size);
|
|
long l;
|
|
|
|
for(l=0; l<n_layers; l++) {
|
|
float* r = res->pixels + l*layer_size;
|
|
int i;
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(i=0; i<layer_size; i++)
|
|
norms[i] += r[i]*r[i];
|
|
}
|
|
norm_norm( norms, layer_size, norm );
|
|
|
|
for(l=0; l<n_layers; l++) {
|
|
float* r = res->pixels + l*layer_size;
|
|
int i;
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(i=0; i<layer_size; i++)
|
|
r[i] /= norms[i]+1e-8;
|
|
}
|
|
|
|
free(norms);
|
|
}
|
|
|
|
|
|
/* Return the vectorized dimension of a HOG patch
|
|
*/
|
|
int get_patch_desc_dim( float_layers* hog, int patch_size )
|
|
{
|
|
return patch_size*patch_size * hog->tz; // number of dimensions of an atomic patch descriptor
|
|
}
|
|
|
|
|
|
/* Sample a set of patches from a HOG image.
|
|
grid : array of (x,y) position of the patches
|
|
size: size of the patches, ie. [x,x+size[ x [y,y+size[
|
|
res: result array, n_patches x desc_dim
|
|
desc_dim = n_layers * size**2
|
|
norms: result, n_patches x 1, norm of each patch
|
|
*/
|
|
void _sample_patches( float_layers* hog, float_layers* color, int_image* grid, int size, float norm,
|
|
float_image* res, float_array* norms, int n_thread ) {
|
|
const int tx = hog->tx;
|
|
const long npix = tx*hog->ty;
|
|
assert( grid->tx == 2 );
|
|
const int n_patches = grid->ty;
|
|
assert( res->ty == n_patches );
|
|
const int n_layers = hog->tz;
|
|
const int n_colors = (color? color->tz: 0);
|
|
const int color_npix = (color? color->tx*color->ty: 0);
|
|
const int desc_size = size*size*n_layers + (color? color->tz: 0);
|
|
assert(res->tx == desc_size );
|
|
|
|
int n;
|
|
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(n=0; n<n_patches; n++) {
|
|
float *r = res->pixels + desc_size*n;
|
|
int *p = grid->pixels + 2*n;
|
|
// copy hog
|
|
int x=p[0],y=p[1];
|
|
assert(0<=x && x+size<=tx);
|
|
assert(0<=y && y+size<=hog->ty);
|
|
int l,j;
|
|
for(l=0; l<n_layers; l++) {
|
|
float* h = hog->pixels + l*npix + y*tx + x;
|
|
for(j=0; j<size; j++) {
|
|
memcpy(r, h, size*sizeof(float));
|
|
h += tx;
|
|
r += size;
|
|
}
|
|
}
|
|
if(!color) continue;
|
|
// copy color
|
|
float* c = color->pixels + (y+size/2)*color->ty + (x+size/2);
|
|
for(l=0; l<n_colors; l++)
|
|
*r++ = c[l*color_npix];
|
|
}
|
|
|
|
if(norm) {
|
|
float* normp = norms ? norms->pixels : NEWAC(float, n_patches);
|
|
if(norms) {
|
|
assert(norms->tx==n_patches);
|
|
memset(normp,0,n_patches*sizeof(float));
|
|
}
|
|
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(n=0; n<n_patches; n++) {
|
|
float *r = res->pixels + desc_size*n;
|
|
int l;
|
|
for(l=0; l<desc_size; l++)
|
|
normp[n] += r[l]*r[l];
|
|
}
|
|
norm_norm( normp, n_patches, norm );
|
|
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(n=0; n<n_patches; n++) {
|
|
float *r = res->pixels + desc_size*n;
|
|
int l;
|
|
float nn = normp[n]+1e-8;
|
|
for(l=0; l<desc_size; l++)
|
|
r[l] /= nn;
|
|
}
|
|
|
|
if(!norms) free(normp);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline int retrieve_children( const int x, const int y, const int_cube* child_grid ) {
|
|
const int size0_div2 = child_grid->pixels[0];
|
|
const int step0 = child_grid->tx==1 && child_grid->ty==1 ? 1 :
|
|
MAX( child_grid->pixels[2]-child_grid->pixels[0],
|
|
child_grid->pixels[1+2*child_grid->tx]-child_grid->pixels[1] );
|
|
int i = (x-size0_div2)/step0;
|
|
int j = (y-size0_div2)/step0;
|
|
assert( x==(i*step0+size0_div2) || !"error: child_grid does not match current grid" );
|
|
assert( y==(j*step0+size0_div2) || !"error: child_grid does not match current grid" );
|
|
if( i<0 || i>=child_grid->tx ) return -1;
|
|
if( j<0 || j>=child_grid->ty ) return -1;
|
|
return i+j*child_grid->tx;
|
|
}
|
|
|
|
/* Prepare a grid of cell positions in the first image for a given scale. Big cells inherit the cell at the previous scale.
|
|
size = size of cells at current scale
|
|
offset, step = grid generator: (offset + i*step, offset + j*step)
|
|
child_grid = grid of the previous layer (or None if first layer)
|
|
child_norms = image containing the norms of the patch at the previous level
|
|
grid = result center positions of cells in current scale
|
|
children = index of cells in previous scale used to construct big cells
|
|
norms = norms of the cells of this level
|
|
*/
|
|
void _prepare_big_cells( int size, int offset, int step,
|
|
int_cube* child_grid, float_image* child_norms,
|
|
int_cube* grid, int_cube* children, float_image* norms ) {
|
|
assert(grid->tz==2);
|
|
const int ntx = grid->tx; // should be == 1+(tx-size)/step so that patches do not pass the border
|
|
const int nty = grid->ty; // should be == 1+(ty-size)/step so that patches do not pass the border
|
|
|
|
/* grid[i,j] = ( offset + i*step, offset + j*step )
|
|
|
|
connection between two scales:
|
|
x cell position in lower scale == x position of children in upper scale
|
|
child_offset + child_i*child_step = offset + i*step + (2*u/(nc-1)-1)*size/4
|
|
*/
|
|
|
|
int i,j,u,v;
|
|
int* r = grid->pixels;
|
|
|
|
if( !child_grid ) {
|
|
// this is the first scale:
|
|
// we just return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
|
|
|
|
for(j=0; j<nty; j++)
|
|
for(i=0; i<ntx; i++) {
|
|
*r++ = offset + i*step;
|
|
*r++ = offset + j*step;
|
|
}
|
|
} else {
|
|
assert(child_grid->tz==2);
|
|
ASSERT_SAME_SIZE( child_grid, child_norms );
|
|
assert( children );
|
|
const int nc = sqrt(children->tz); // number of children per row or col
|
|
assert( children->tz==pow2(nc) );
|
|
ASSERT_SAME_SIZE( grid, children );
|
|
ASSERT_SAME_SIZE( grid, norms );
|
|
// this is at least second scale
|
|
// we return a grid of step size*(1-overlap/2) in [0, tx[ x [0, ty[
|
|
|
|
const int quarter = size/4;
|
|
assert(4*quarter==size);
|
|
int* c = children->pixels;
|
|
float *n = norms->pixels;
|
|
memset(n,0,ntx*nty*sizeof(float));
|
|
for(j=0; j<nty; j++)
|
|
for(i=0; i<ntx; i++) {
|
|
int x = offset + i*step;
|
|
int y = offset + j*step;
|
|
*r++ = x;
|
|
*r++ = y;
|
|
|
|
// accumulate norms from 2x2 or 3x3 neighbors
|
|
for(v=0; v<nc; v++)
|
|
for(u=0; u<nc; u++,c++) {
|
|
// we want to index the children at position:
|
|
// ( center_x + (2*u/(nc-1)-1)*size/4, center_y + (2*v/(nc-1)-1)*size/4 )
|
|
*c = retrieve_children( x+(2*u/(nc-1)-1)*quarter, y+(2*v/(nc-1)-1)*quarter, child_grid );
|
|
if(*c>=0) *n += child_norms->pixels[*c];
|
|
}
|
|
n++;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* Prepare image for dotprod : dot(patches, res)
|
|
where patches is n_patches x patch_dim
|
|
set outside of the image to be equal to (0,...,ninth_val)
|
|
*/
|
|
void _prepare_dotprod_convolution( float_layers* img, int patch_size, float ninth_val, int extend,
|
|
float_layers* res, int n_thread ) {
|
|
assert( img->tx+extend == res->tx );
|
|
assert( img->ty+extend == res->ty );
|
|
const int n_layers = img->tz;
|
|
const int tx = img->tx;
|
|
const int ty = img->ty;
|
|
const int npix = tx*ty;
|
|
const int npixex = (tx+extend)*(ty+extend);
|
|
assert( res->tz==patch_size*patch_size*img->tz );
|
|
|
|
long l;
|
|
const int first_half = patch_size/2; // half-size
|
|
const int second_half = patch_size - first_half;
|
|
const int layer_size = patch_size*patch_size*npixex;
|
|
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(l=0; l<n_layers; l++) {
|
|
float* img_pix = img->pixels + l*npix;
|
|
float* r = res->pixels + l*layer_size;
|
|
int u,v;
|
|
// copy translated version of the image into res
|
|
for(v=-first_half; v<second_half; v++)
|
|
for(u=-first_half; u<second_half; u++)
|
|
r = fast_set_trans( r, img_pix, 1, u, v, tx, ty, extend, l+1<n_layers? 0 : ninth_val );
|
|
}
|
|
}
|
|
|
|
|
|
float_layers* prepare_dotprod_convolution( float_layers* hog, int patch_size, int extend, float norm, int nt )
|
|
{
|
|
assert(0<=extend and extend<=1);
|
|
const int nh = get_patch_desc_dim(hog,patch_size);
|
|
const int etx = hog->tx+extend; // extend a bit the image
|
|
const int ety = hog->ty+extend;
|
|
|
|
float_layers* res = NEW(float_layers);
|
|
*res = empty_layers(float,etx,ety,nh);
|
|
|
|
float ninth_val = 0;
|
|
_prepare_dotprod_convolution( hog, patch_size, ninth_val, extend, res, nt );
|
|
|
|
if( norm ) norm_layers( res, norm, nt );
|
|
return res;
|
|
}
|
|
|
|
|
|
inline float sum_array_f(const float* a, int n) {
|
|
int i=n;
|
|
double res = 0;
|
|
while(i--) res+=a[i];
|
|
return (float)res;
|
|
}
|
|
|
|
|
|
extern "C" {
|
|
int sgemm_(char *transa, char *transb, integer *m, integer *
|
|
n, integer *k, float *alpha, float *a, integer *lda, float *b, integer *
|
|
ldb, float *beta, float *c, integer *ldc);
|
|
}
|
|
|
|
/* matrix-matrix multiplication with several SGEMM (each is single-threaded)
|
|
res = dot(patches, convolved_hog)
|
|
P*npix P * nh nh * npix
|
|
*/
|
|
void _dotprod( float_image* patches, float_layers* convolved_hog, float_layers* res, int n_thread ) {
|
|
int nh = patches->tx;
|
|
assert( nh == convolved_hog->tz );
|
|
ASSERT_SAME_IMG_SIZE( convolved_hog, res );
|
|
int P = patches->ty;
|
|
assert( res->tz == P );
|
|
int threadP = 1 + (P-1) / n_thread; // how many patches per thread
|
|
int npix = (int)IMG_SIZE(convolved_hog);
|
|
|
|
int l;
|
|
#if (defined(USE_OPENMP) && !defined(MULTITHREADED_BLAS))
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#else
|
|
n_thread = 1; // BLAS is already multithreaded
|
|
threadP = P;
|
|
#endif
|
|
for(l=0; l<n_thread; l++) {
|
|
// we do dotprod( patches[l*threadP : (l+1)*threadP], convolved_hog )
|
|
long start = l*threadP;
|
|
long end = MIN(P,(l+1)*threadP);
|
|
int np = int(end - start);
|
|
float* p = patches->pixels + nh*start;
|
|
float* r = res->pixels + npix*start;
|
|
|
|
// blas fast matrix-matrix product
|
|
char T='n'; float alpha = 1, beta = 0;
|
|
sgemm_( &T, &T, &npix, &np, &nh, &alpha,
|
|
convolved_hog->pixels, &npix,
|
|
p, &nh, &beta, r, &npix);
|
|
}
|
|
}
|
|
|
|
inline void transpose_scalar_block(const float *A, float *B, const int lda, const int ldb,
|
|
const int block_row, const int block_col) {
|
|
for(int i=0; i<block_row; i++)
|
|
for(int j=0; j<block_col; j++)
|
|
B[j*ldb + i] = A[i*lda +j];
|
|
}
|
|
|
|
// Transpose A (N rows by M cols) into B (M by N)
|
|
void transpose_matrix(const float_image* A, float_image* B, int nt) {
|
|
const int n = A->ty, m = A->tx;
|
|
assert( n==B->tx && m==B->ty );
|
|
const int block_size = 16;
|
|
const float* pA = A->pixels;
|
|
float* pB = B->pixels;
|
|
|
|
#ifdef USE_OPENMP
|
|
#pragma omp parallel for num_threads(nt)
|
|
#endif
|
|
for(int i=0; i<n; i+=block_size)
|
|
for(int j=0; j<m; j+=block_size)
|
|
transpose_scalar_block(&pA[i*m +j], &pB[j*n + i], m, n, MIN(block_size, n-i), MIN(block_size, m-j));
|
|
}
|
|
|
|
extern "C" {
|
|
int sgemv_(char *transa, integer *m, integer * n,
|
|
float *alpha, float *a, integer *lda,
|
|
float *b, integer * ldb, float *beta,
|
|
float *c, integer * ldc);
|
|
}
|
|
|
|
/* convolution of each patch within a local neighborhood
|
|
ngh_rad = max translation
|
|
neighborhood has size 2*ngh_rad
|
|
patch at (x,y) is compared to patches in [y-ngh_rad : y+ngh_rad,
|
|
x-ngh_rad : y+ngh_rad]
|
|
*/
|
|
void _dotprod_ngh_rad_T( int_cube* grid, float_image* patches, int ngh_rad,
|
|
float_cube* convolved_hog, float_layers* res_out,
|
|
int_image* offsets, int n_thread ) {
|
|
int nh = patches->tx;
|
|
assert( nh == convolved_hog->tz );
|
|
const int P = patches->ty;
|
|
assert( IMG_SIZE(grid)==P && grid->tz==2 );
|
|
const int tx = convolved_hog->tx;
|
|
const int ty = convolved_hog->ty;
|
|
|
|
// neighborhood size
|
|
int res_tx = MIN(tx,2*ngh_rad);
|
|
int res_ty = MIN(ty,2*ngh_rad);
|
|
assert(res_tx<tx-1 || res_ty<ty-1 || !"ngh_rad is too large and results in loss of perf. Set ngh_rad=0 instead.");
|
|
int res_npix = res_tx * res_ty;
|
|
// allocate result
|
|
*res_out = empty_layers(float, res_tx, res_ty, P);
|
|
assert(res_out->pixels || !"error: ran out of memory before sgemm");
|
|
*offsets = empty_image(int, 2, P);
|
|
|
|
char T='t'; float alpha=1, beta=0; int one=1;
|
|
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(int j=0; j<res_ty; ++j) {
|
|
// By organizing loops this way,
|
|
// we exploit overlap between patches.
|
|
|
|
for(int l=0; l<P; l++) {
|
|
float* p = patches->pixels + l*nh;
|
|
float* r = res_out->pixels + l*res_npix;
|
|
int left = MAX(0, MIN(grid->pixels[2*l+0] - ngh_rad, tx-2*ngh_rad));
|
|
int top = MAX(0, MIN(grid->pixels[2*l+1] - ngh_rad, ty-2*ngh_rad));
|
|
if(j==0) {
|
|
offsets->pixels[2*l+0] = left;
|
|
offsets->pixels[2*l+1] = top;
|
|
}
|
|
float* c = convolved_hog->pixels + (left + top*tx)*nh;
|
|
|
|
// blas fast matrix-vector product
|
|
sgemv_( &T, &nh, &res_tx, &alpha, c + j*tx*nh, &nh,
|
|
p, &one, &beta, r + j*res_tx, &one);
|
|
}
|
|
}
|
|
}
|
|
|
|
/* correct the convolution on the boundaries of the image
|
|
ttx, tty: true shape of the res_map (in case of using offsets)
|
|
*/
|
|
void rectify_conv( int patch_size, int nori, float_image* patches, int_image* offsets,
|
|
const int ttx, const int tty, int extend, float_layers* res, int n_thread ) {
|
|
const int n_patches = patches->ty;
|
|
assert( n_patches == res->tz );
|
|
//const int nori = patches->tx/pow2(patch_size);
|
|
assert( patches->tx >= nori*pow2(patch_size) );
|
|
const int tx = res->tx; // real true shape because it has been extended
|
|
const int ty = res->ty;
|
|
const int first_half = patch_size/2;
|
|
const int second_half = patch_size - first_half; // in case patch_size is odd
|
|
assert( offsets || (ttx==tx && tty==ty) );
|
|
assert( !offsets || (ttx>=tx && tty>=ty) );
|
|
assert( !offsets || (offsets->ty==res->tz && offsets->tx==2) );
|
|
const long npix = IMG_SIZE(res);
|
|
|
|
int l;
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(l=0; l<n_patches; l++) {
|
|
// load offsets
|
|
const int offi = offsets ? offsets->pixels[2*l+0] : 0;
|
|
const int offj = offsets ? offsets->pixels[2*l+1] : 0;
|
|
|
|
float sums[8]; // temporary norm of columns or rows
|
|
assert( patch_size <= (int)(sizeof(sums)/sizeof(sums[0])) );
|
|
int o,i,j;
|
|
|
|
// horizontal boundaries
|
|
memset(sums,0,sizeof(sums));
|
|
float* p = patches->pixels + l*patches->tx;
|
|
for(o=0; o<nori; o++)
|
|
for(j=0; j<patch_size; j++)
|
|
for(i=0; i<patch_size; i++)
|
|
sums[j] += pow2(*p++);
|
|
|
|
float old_norm = sqrt(sum_array_f(sums,patch_size));
|
|
if( old_norm==0 ) continue;
|
|
|
|
// upper boundary
|
|
for(j=offj; j<first_half; j++) {
|
|
float new_norm = sqrt(sum_array_f(sums+(first_half-j),second_half+j)); // sums to patch_size
|
|
float mul = old_norm / (new_norm + 1e-8);
|
|
float* r = res->pixels + l*npix + (j-offj)*tx;
|
|
for(i=0; i<tx; i++) {
|
|
r[i] *= mul;
|
|
//assert(r[i]<1.1);
|
|
}
|
|
}
|
|
// lower boundary
|
|
for(j=tty-extend+1-second_half; j<offj+ty; j++) {
|
|
float new_norm = sqrt(sum_array_f(sums,first_half+tty-extend-j)); // sums to patch_size
|
|
float mul = old_norm / (new_norm + 1e-8);
|
|
float* r = res->pixels + l*npix + (j-offj)*tx;
|
|
for(i=0; i<tx; i++) {
|
|
r[i] *= mul;
|
|
//assert(r[i]<1.1);
|
|
}
|
|
}
|
|
|
|
// vertical boundaries
|
|
memset(sums,0,sizeof(sums));
|
|
p = patches->pixels + l*patches->tx;
|
|
for(o=0; o<nori; o++)
|
|
for(j=0; j<patch_size; j++)
|
|
for(i=0; i<patch_size; i++)
|
|
sums[i] += pow2(*p++);
|
|
|
|
// left boundary
|
|
for(i=offi; i<first_half; i++) {
|
|
float new_norm = sqrt(sum_array_f(sums+(first_half-i),second_half+i));
|
|
float mul = old_norm / (new_norm + 1e-8);
|
|
float* r = res->pixels + l*npix + (i-offi);
|
|
for(j=0; j<ty; j++) {
|
|
r[j*tx] *= mul;
|
|
//assert(r[j*tx]<1.1);
|
|
}
|
|
}
|
|
// right boundary
|
|
for(i=ttx-extend+1-second_half; i<offi+tx; i++) {
|
|
float new_norm = sqrt(sum_array_f(sums,first_half+ttx-extend-i));
|
|
float mul = old_norm / (new_norm + 1e-8);
|
|
float* r = res->pixels + l*npix + (i-offi);
|
|
for(j=0; j<ty; j++) {
|
|
r[j*tx] *= mul;
|
|
//assert(r[j*tx]<1.1);
|
|
}
|
|
}
|
|
|
|
// because we over-estimated the rectification for the corners, check that they do not overpass old_norm
|
|
float* r = res->pixels + l*npix;
|
|
for(j=offj; j<first_half; j++) {
|
|
for(i=offi; i<first_half; i++)
|
|
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
|
|
for(i=ttx-extend+1-second_half; i<offi+tx; i++)
|
|
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
|
|
}
|
|
for(j=tty-extend+1-second_half; j<offj+ty; j++) {
|
|
for(i=offi; i<first_half; i++)
|
|
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
|
|
for(i=ttx-extend+1-second_half; i<offi+tx; i++)
|
|
r[(j-offj)*tx+(i-offi)] = MIN(r[(j-offj)*tx+(i-offi)], old_norm);
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/* Compute the correlation of all patches with the second image (hog).
|
|
In case of ngh_rad>0, the correlation is only computed in a small local neighborhood
|
|
(whose size is parameterized by ngh_rad).
|
|
if extend: width and height of output maps are extended
|
|
if norm: correlation are normalized afterwards.
|
|
*/
|
|
void fastconv( float_image* patches, float_layers* hog, int patch_size, int ngh_rad,
|
|
int extend, float norm, int nt, res_scale* res ) {
|
|
|
|
assert(0<=extend and extend<=1);
|
|
float_layers* convolved_hog = prepare_dotprod_convolution( hog, patch_size, extend, norm, nt );
|
|
assert( patches->tx==convolved_hog->tz);
|
|
res->true_shape[0] = convolved_hog->tx;
|
|
res->true_shape[1] = convolved_hog->ty;
|
|
//hash_layers(convolved_hog)
|
|
|
|
int_image* offsets = NULL;
|
|
if( ngh_rad == 0 ) { // no limit on translation
|
|
// allocate result
|
|
res->res_map = empty_layers(float, convolved_hog->tx, convolved_hog->ty, patches->ty);
|
|
assert(res->res_map.pixels || !"error: ran out of memory before sgemm");
|
|
|
|
// multi-threaded fast matrix product
|
|
_dotprod( patches, convolved_hog, &res->res_map, nt );
|
|
|
|
} else { // ngh_rad>0: cropping res_map
|
|
offsets = &res->offsets;
|
|
|
|
// transpose hog: _dotprod is much faster this way
|
|
float_cube convolved_hog_T = empty_cube(float, convolved_hog->tx, convolved_hog->ty, convolved_hog->tz);
|
|
{ float_image A = reshape_xy_z(float, convolved_hog); // cast to 2D matrix without copy
|
|
float_image B = reshape_z_xy(float, &convolved_hog_T);
|
|
transpose_matrix( &A, &B, nt);
|
|
}
|
|
//hash_cube(&convolved_hog_T)
|
|
|
|
// resized grid
|
|
int_cube fgrid = cube_like(int, &res->grid);
|
|
for(int i=0; i<CUBE_SIZE(&fgrid); i++)
|
|
fgrid.pixels[i] = res->grid.pixels[i]/res->f;
|
|
//hash_cube(&fgrid)
|
|
|
|
// multi-threaded fast matrix product
|
|
_dotprod_ngh_rad_T( &fgrid, patches, ngh_rad, &convolved_hog_T, &res->res_map, offsets, nt );
|
|
|
|
free(fgrid.pixels);
|
|
free(convolved_hog_T.pixels);
|
|
//hash_image(offsets)
|
|
}
|
|
free_layers(convolved_hog);
|
|
|
|
// correct border effects on the correlation maps
|
|
rectify_conv( patch_size, hog->tz, patches, offsets, res->true_shape[0], res->true_shape[1],
|
|
extend, &res->res_map, nt );
|
|
}
|
|
|
|
|
|
|
|
/* Compute: arr **= p
|
|
*/
|
|
void fastipow( float_layers* arr, const float p, int n_thread ) {
|
|
const int n_layers = arr->tz;
|
|
const long npix = arr->tx*arr->ty;
|
|
int l;
|
|
|
|
// optimization: precompute some values of pow(x,p)
|
|
const int npc = 64;
|
|
float precom[npc+1];
|
|
for(l=0; l<=npc; l++) precom[l]= pow(l/(float)npc,p);
|
|
const float maxindex = npc - 0.001;
|
|
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(l=0; l<n_layers; l++) {
|
|
float* a = arr->pixels + l*npix;
|
|
int i;
|
|
for(i=0; i<npix; i++) {
|
|
// arr[i] = pow(arr[i],p);
|
|
float v = a[i]*npc;
|
|
assert( v>=0 && v<npc+1 );
|
|
if(v>maxindex) v=maxindex;
|
|
int n = int(v);
|
|
float w = v-n;
|
|
a[i] = (1-w)*precom[n] + w*precom[n+1];
|
|
}
|
|
}
|
|
}
|
|
|
|
/* Compute: arr = max(0,(arr-p)/(1-p))
|
|
*/
|
|
void fasthinge( float_layers* arr, const float p, int n_thread ) {
|
|
const int n_layers = arr->tz;
|
|
const long npix = arr->tx*arr->ty;
|
|
int l;
|
|
const float f = 1/(1-p);
|
|
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(l=0; l<n_layers; l++) {
|
|
float* a = arr->pixels + l*npix;
|
|
int i;
|
|
for(i=0; i<npix; i++) {
|
|
float v = a[i];
|
|
a[i] = MAX(0,f*(v-p));
|
|
}
|
|
}
|
|
}
|
|
|
|
inline int max_array_i(const int* a, int n) {
|
|
int i=n;
|
|
int res = INT_MIN;
|
|
while(i--) if(a[i]>res) res=a[i];
|
|
return res;
|
|
}
|
|
|
|
/* Normalize weights in border areas of width <gap>.
|
|
There are 9 areas: top-left, top-middle, top-right, ..., bottom-right.
|
|
sum_divf indicates the current weight in those areas, i.e. values in the area
|
|
should be divided by the weight. But trans_inv allow to control the amount of
|
|
normalization: 0=no normalization, 1=normal
|
|
*/
|
|
static inline void normalize_trans(const int tx, const int ty, const int gap, float* rmap,
|
|
const float trans_inv, float sum_divf[9] ) {
|
|
if( trans_inv == 0 ) return;
|
|
int i,j;
|
|
for(i=0; i<9; i++) {
|
|
if( sum_divf[i]>0 )
|
|
sum_divf[i] = 1/pow(sum_divf[i], trans_inv); // if trans_inv==1, no effect
|
|
}
|
|
for(j=0; j<gap; j++) {
|
|
if(sum_divf[0])
|
|
for(i=0; i<gap; i++)
|
|
rmap[j*tx+i] *= sum_divf[0];
|
|
if(sum_divf[1])
|
|
for(i=gap; i<tx-gap; i++)
|
|
rmap[j*tx+i] *= sum_divf[1];
|
|
if(sum_divf[2])
|
|
for(i=tx-gap; i<tx; i++)
|
|
rmap[j*tx+i] *= sum_divf[2];
|
|
}
|
|
for(; j<ty-gap; j++) {
|
|
if(sum_divf[3])
|
|
for(i=0; i<gap; i++)
|
|
rmap[j*tx+i] *= sum_divf[3];
|
|
if(sum_divf[5])
|
|
for(i=tx-gap; i<tx; i++)
|
|
rmap[j*tx+i] *= sum_divf[5];
|
|
}
|
|
for(; j<ty; j++) {
|
|
if(sum_divf[6])
|
|
for(i=0; i<gap; i++)
|
|
rmap[j*tx+i] *= sum_divf[6];
|
|
if(sum_divf[7])
|
|
for(i=gap; i<tx-gap; i++)
|
|
rmap[j*tx+i] *= sum_divf[7];
|
|
if(sum_divf[8])
|
|
for(i=tx-gap; i<tx; i++)
|
|
rmap[j*tx+i] *= sum_divf[8];
|
|
}
|
|
}
|
|
|
|
|
|
|
|
/* Compute the (sparse) convolutions specified by <children> on <map> and put the result in <res>.
|
|
A standard order is assumed on the children:
|
|
a response map #p is built from the children[p] at positions
|
|
[(gap*dx,gap*dy) for dy in dys for dx in dxs]
|
|
where dxs = [-1,1] or [-1,0,1]
|
|
dys = [-1,1] or [-1,0,1]
|
|
child_assign denote assignement of the children level, while assign is for the next level
|
|
child_norms contain the norms of small patches and norms for big new cells
|
|
*/
|
|
int _sparse_conv( int_image* children, int_array* child_assign, int gap, float trans_inv,
|
|
float_layers* child_map, int_image* offsets, float_array* child_norms, float_array* norms,
|
|
int_array* assign, float_layers* res, int_image* res_offsets, int n_thread ) {
|
|
const int nconv = children->ty; // number of convolutions to perform
|
|
const int nc2 = children->tx;
|
|
const int nc = sqrt(nc2);
|
|
assert( nc*nc == nc2 );
|
|
assert( res->tz == nconv );
|
|
const int tx = child_map->tx;
|
|
const int ty = child_map->ty;
|
|
const long npix = tx*ty;
|
|
ASSERT_SAME_SIZE( child_map, res );
|
|
const int n_lower_conv = max_array_i(children->pixels,nconv*nc2)+1;
|
|
int* cass = child_assign ? child_assign->pixels : NEWA(int,n_lower_conv);
|
|
if(!child_assign) {for(int i=0; i<n_lower_conv; i++) cass[i]=i;}
|
|
assert( !offsets || (offsets->pixels && offsets->tx==2 && offsets->ty==n_lower_conv &&
|
|
res_offsets && res_offsets->tx==2 && res_offsets->ty==nconv) );
|
|
|
|
if(assign) {
|
|
assert(0); // not supposed to happen
|
|
} else {
|
|
// normal case: no redundancy to exploit in response maps
|
|
|
|
int l;
|
|
#if defined(USE_OPENMP)
|
|
#pragma omp parallel for num_threads(n_thread)
|
|
#endif
|
|
for(l=0; l<nconv; l++) {
|
|
float *rmap = res->pixels + l*npix;
|
|
|
|
int u,v,c,ncall=0; // children number
|
|
const int* const child = children->pixels + l*nc2;
|
|
|
|
float sum_divf[9];
|
|
memset(sum_divf,0,sizeof(sum_divf));
|
|
int i,j;
|
|
|
|
// first, choose an offset for the result rmap from the child offsets
|
|
int offx=0, offy=0;
|
|
if( offsets ) {
|
|
int sum_ox=0, sum_oy=0, w=0;
|
|
for(c=v=0; v<nc; v++) {
|
|
int dy = (2*v/(nc-1)-1);
|
|
for(u=0; u<nc; u++,c++) {
|
|
int dx = (2*u/(nc-1)-1);
|
|
|
|
if(child[c]<0 || cass[child[c]]<0) continue;
|
|
|
|
sum_ox += offsets->pixels[2*child[c]+0] - dx*gap;
|
|
sum_oy += offsets->pixels[2*child[c]+1] - dy*gap;
|
|
w++;
|
|
}
|
|
}
|
|
if(w==0) w++; // just in case
|
|
offx = (int)floor(0.5 + sum_ox/float(w));
|
|
offy = (int)floor(0.5 + sum_oy/float(w));
|
|
|
|
// store result for later
|
|
res_offsets->pixels[2*l+0] = offx;
|
|
res_offsets->pixels[2*l+1] = offy;
|
|
}
|
|
|
|
for(c=v=0; v<nc; v++) {
|
|
int dy = (2*v/(nc-1)-1);
|
|
for(u=0; u<nc; u++,c++) {
|
|
int dx = (2*u/(nc-1)-1);
|
|
|
|
if(child[c]<0 || cass[child[c]]<0) continue;
|
|
float divf = child_norms->pixels[child[c]]/norms->pixels[l];
|
|
|
|
// difference with rmap's offset
|
|
const int trans_x = dx*gap + (offsets? offx - offsets->pixels[2*child[c]+0] : 0);
|
|
const int trans_y = dy*gap + (offsets? offy - offsets->pixels[2*child[c]+1] : 0);
|
|
|
|
// count the sum of weights in every image area
|
|
for(i=-1; i<=1; i++)for(j=-1; j<=1; j++)
|
|
if(i*trans_x<=0 && j*trans_y<=0)
|
|
sum_divf[4+j*3+i] += divf;
|
|
|
|
// add a translated version of map[children[c]] by (ox-dx,oy-dy)
|
|
if(ncall++==0) // first call
|
|
fast_set_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf, trans_x,trans_y, tx,ty, 0, 0 );
|
|
else
|
|
fast_add_trans( rmap, child_map->pixels + cass[child[c]]*npix, divf, trans_x,trans_y, tx,ty, 0, 0 );
|
|
}
|
|
}
|
|
|
|
if( ncall == 0) // default = zeros
|
|
memset(rmap, 0, npix*sizeof(float));
|
|
|
|
// now we are supposed to rectify the boundaries (to perfect convolution)
|
|
normalize_trans(tx, ty, gap, rmap, trans_inv, sum_divf );
|
|
|
|
//assert(min_array_f(rmap,npix)>=0 && max_array_f(rmap,npix)<=1.001);
|
|
}
|
|
}
|
|
if(!child_assign) free(cass);
|
|
|
|
#define CHECK_MAPS(rmaps) assert(min_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))>=0 && \
|
|
max_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))<=1.001)
|
|
//CHECK_MAPS(res);
|
|
|
|
return nconv;
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|