/*
Copyright (C) 2014 Jerome Revaud

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>
*/
#include "deep_matching.h"
#include "std.h"
#include "conv.h"
#include "maxfilter.h"

// return size of atomic patches
int get_atomic_patch_size( const dm_params_t* params )
{
  int upsize = (1 << params->prior_img_downscale);
  return 4*upsize;
}
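
// Worked example (illustrative): with the default prior_img_downscale = 1,
// upsize = 1<<1 = 2, so atomic patches are 4*2 = 8 pixels wide.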

// crop dimensions to a multiple of patch_size
void get_source_shape( const int width, const int height, const int patch_size, int* res ) {
  // crop the reference image to a multiple of patch size
  res[0] = patch_size * int(width / patch_size);
  res[1] = patch_size * int(height / patch_size);
}
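
// Worked example (illustrative): width = 1023, height = 769, patch_size = 8
// yields a cropped source shape of 1016 x 768.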

// extract pixel descriptor for both images
void extract_image_desc( image_t* img0, image_t* img1, const dm_params_t* params,
                         float_layers** desc0, float_layers** desc1 )
{
  // slightly reduce img0 size to fit the patch tiling
  int patch_size = get_atomic_patch_size( params );

  int size[2]; // = {width, height}
  get_source_shape( img0->width, img0->height, patch_size, size );
  image_crop(img0, size[0], size[1]);

  // extract gradient-based information
  *desc0 = extract_desc( img0, &params->desc_params, params->n_thread );
  *desc1 = extract_desc( img1, &params->desc_params, params->n_thread );
}

void avgpool2( float_layers* hog, const dm_params_t* params )
{
  int niter = params->prior_img_downscale;
  while(niter--) {
    float_layers res = empty_layers(float,hog->tx/2,hog->ty/2,hog->tz);
    _avgpool2(hog,&res,params->n_thread);

    // replace hog by res
    free(hog->pixels);
    *hog = res;
  }
}
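
// Illustrative note: each iteration halves the descriptor resolution, so with
// prior_img_downscale = 1 a 512x384 descriptor grid is pooled once down to 256x192.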

/* compute the grid of parent cell position, and their connection to children cells
   cells can be half-overlapping if <overlap>=1
   <dense_step> forces the grid spacing if >0
*/
void prepare_big_cells( const int imshape[2], int cell_size, int overlap, int child_overlap,
                        int_cube* child_grid, float_image* child_norms, int dense_step,
                        int_cube* grid, int_cube* children, float_image* norms )
{
  int offset, step, gtx, gty;
  if( dense_step ) {
    step = dense_step;
    offset = 0;
    // we do not care if the patches are overlapping outside the image
    #define grid_size(imsize) (1+imsize/step)
    gtx = grid_size(imshape[0]);
    gty = grid_size(imshape[1]);
    #undef grid_size
  } else {
    // we want patches fully included in the image
    offset = cell_size/2;
    step = cell_size/(overlap+1);
    #define grid_size(imsize) (1+MAX(0,imsize-2*offset)/step)
    gtx = grid_size(imshape[0]);
    gty = grid_size(imshape[1]);
    #undef grid_size
  }

  assert(!grid->pixels);
  *grid = empty_cube(int,gtx,gty,2);

  assert(0<=overlap && overlap<=1);
  int nc = pow2(2+child_overlap); // number of children per cell
  if(child_grid) {
    assert(!norms->pixels);
    *norms = image_like(float,grid);
    assert(!children->pixels);
    *children = empty_cube(int,gtx,gty,nc);
  }

  _prepare_big_cells( cell_size, offset, step, child_grid, child_norms, grid, children, norms );
}
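
// Worked example (illustrative): imshape = {64, 48}, cell_size = 16, overlap = 1,
// child_overlap = 0, dense_step = 0 gives offset = 8 and step = 8, hence a grid of
// gtx = 1 + (64-16)/8 = 7 by gty = 1 + (48-16)/8 = 5 cells, each with nc = 4 children.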

void sample_patches( float_layers* hog, int_cube* pos, int patch_size, int f, float norm, int n_thread,
                     float_image* patches, float_array* norms )
{
  assert(norm>0);
  const int npos = pos->tx*pos->ty;
  int_image new_pos = empty_image(int,2,npos);
  for(int i=0; i<2*npos; i++)
    new_pos.pixels[i] = (pos->pixels[i]-patch_size/2)/f;

  patch_size /= f;
  const int nh = get_patch_desc_dim(hog,patch_size);

  assert(!patches->pixels);
  *patches = empty_image(float,nh,npos);
  assert(norms->tx==npos);

  _sample_patches( hog, NULL, &new_pos, patch_size, norm, patches, norms, n_thread );

  free(new_pos.pixels);
}

const float trans_inv = 0.9f;

void convolve_atomic_patches( float_layers* source, float_layers* target,
                              const dm_params_t* params, res_scale* first_level )
{
  const int extend = 1; // slightly spatially extend response maps
  const float norm = 1; // renorm patches

  const int f = first_level->f;              // scale factor w.r.t. original image
  const int psize = first_level->patch_size; // current patch size

  // first, sample patches
  float_image patches = {0};
  assert(!first_level->norms.pixels);
  first_level->norms = image_like(float, &first_level->grid);
  float_array norms_arr = {first_level->norms.pixels, (int)IMG_SIZE(&first_level->norms)};
  sample_patches( source, &first_level->grid, psize, f, norm, params->n_thread, &patches, &norms_arr );
  //hash_image(&patches)

  // rectify the norm to a boolean (0 or 1) (useless ?)
  first_level->assign = empty_array(int,norms_arr.tx);
  int n=0, tx = patches.tx;
  for(int i=0; i<norms_arr.tx; i++) {
    norms_arr.pixels[i] = norms_arr.pixels[i]>0;

    // eliminate zero-norm patches
    if( norms_arr.pixels[i] ) {
      if( n < i ) // copy
        memcpy( patches.pixels + n*tx, patches.pixels + i*tx, tx*sizeof(float));
      first_level->assign.pixels[i] = n++;
    } else
      first_level->assign.pixels[i] = -1;

    // convolution is not fully invariant to the image border:
    // blank cells outside the image are a bit disadvantageous
    if( norms_arr.pixels[i] == 0 )
      norms_arr.pixels[i] = 1-trans_inv;
  }
  patches.ty = n; // update new number of valid patches

  //hash_image(&first_level->norms)
  //hash_image(&patches)

  // compute the first level convolutions
  fastconv( &patches, target, psize/f, params->ngh_rad/f, extend, norm, params->n_thread, first_level );

  free(patches.pixels);
}

int_image* maxpool3_and_subsample2( float_layers* hog, int true_shape[2], int_image* offsets, float_layers* res, int nt )
{
  assert(!res->pixels);
  if ( offsets->pixels == NULL )
    assert( hog->tx == true_shape[0] && hog->ty == true_shape[1] );

  // set downsampled size
  true_shape[0] = (true_shape[0]+1)/2;
  true_shape[1] = (true_shape[1]+1)/2;
  assert( true_shape[0]>0 && true_shape[1]>0 );

  if ( offsets->pixels == NULL ) {
    // joint max-pooling and subsampling
    *res = empty_layers(float, true_shape[0], true_shape[1], hog->tz);
    _max_filter_3_and_subsample_layers( hog, res, nt );
    return NULL;

  } else {
    // with offsets
    float_layers maxpooled_hog = layers_like(float, hog);
    _max_filter_3_layers( hog, &maxpooled_hog, nt );
    //CHECK_MAPS(&maxpooled_hog);

    // slightly bigger, so that the minimum size is always >= 2
    int width = (hog->tx+2)/2;
    int height = (hog->ty+2)/2;
    *res = empty_layers(float, width, height, hog->tz);
    _subsample2_offset( &maxpooled_hog, offsets, res, nt );
    free(maxpooled_hog.pixels);

    // compute new offsets
    int_image* res_offsets = NEW(int_image);
    *res_offsets = image_like(int, offsets);
    for(long i=0; i<IMG_SIZE(offsets); i++)
      res_offsets->pixels[i] = (int)floor( offsets->pixels[i]/2.f );
    return res_offsets;
  }
}

#define CHECK_MAPS(rmaps) assert(min_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))>=0 && \
                                 max_array_f((rmaps)->pixels,LAYERS_SIZE(rmaps))<=1.001)

/* aggregate response maps of children patches to form response maps of parent patches */
int sparse_conv( int_cube* children, int_array* children_assign, float_image* child_norms,
                 int true_patch_size, float_layers* map, int_image* offsets, int nt,
                 res_scale* res )
{
  float_layers ext_map;
  if( MIN(map->tx,map->ty) < 5 ) {
    ext_map = zeros_layers(float,MAX(5,map->tx),MAX(5,map->ty),map->tz);
    for(int l=0; l<map->tz; l++)
      for(int j=0; j<map->ty; j++)
        for(int i=0; i<map->tx; i++)
          ext_map.pixels[(l*ext_map.ty + j)*ext_map.tx + i] = map->pixels[(l*map->ty + j)*map->tx + i];
    map = &ext_map;
    res->true_shape[0] = ext_map.tx;
    res->true_shape[1] = ext_map.ty;
  }

  int_image _children = reshape_z_xy(int, &res->children);

  if( offsets )
    res->offsets = empty_image(int, 2, _children.ty);

  assert(!res->res_map.pixels);
  res->res_map = empty_layers(float, map->tx, map->ty, _children.ty);
  int gap = true_patch_size / 4;
  assert(gap>0);
  float_array _norms = reshape_xy(float, &res->norms);
  float_array _child_norms = reshape_xy(float, child_norms);

  // allocate useless assign
  res->assign = empty_array(int, res->res_map.tz);
  for(int i=0; i<res->assign.tx; i++) res->assign.pixels[i] = i;

  int_array* _assign = NULL;
  int_array* _ch_assign = children_assign->pixels ? children_assign : NULL;
  int n = _sparse_conv( &_children, _ch_assign, gap, trans_inv, map, offsets,
                        &_child_norms, &_norms, _assign, &res->res_map, &res->offsets, nt );
  //CHECK_MAPS(res);

  if(map==&ext_map) free(ext_map.pixels);
  return n;
}

res_scale new_pyramid_level(int f, int psize)
{
  res_scale res = {0};    // initialize everything to 0/NULL
  res.f = f;              // subsampling factor with respect to original image size
  res.patch_size = psize; // patch size in original image coordinates
  return res;
}
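
// Illustrative pyramid progression (default prior_img_downscale = 1): level 0 starts at
// f = 2, patch_size = 8, and each subsequent level doubles both (f = 4, psize = 16;
// f = 8, psize = 32; ...) until 2*psize exceeds max_psize or the largest source dimension.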

// Compute the multi-scale pyramid response
void compute_matching_pyr( float_layers* source, float_layers* target, const dm_params_t* params,
                           matching_pyramid_t& res_maps )
{
  const int src_shape[2] = {source->tx, source->ty};
  int L = 0; // current pyramid level
  const int atomic_psize = get_atomic_patch_size( params );
  int psize = atomic_psize; // will grow by a factor 2 at each level
  int f = psize/4;          // initial scaling factor

  // subsample if needed
  avgpool2( source, params );
  avgpool2( target, params );

  //hash_layers(source)
  //hash_layers(target)

  res_maps.clear();
  res_maps.push_back(new_pyramid_level(f,psize));
  res_scale *child, *last = &res_maps[res_maps.size()-1];

  // compute the initial patches in source image
  if( params->verbose ) std_printf("layer %d, patch_size = %dx%d\n", L, psize, psize);
  prepare_big_cells( src_shape, psize, params->overlap<L+1, 0, NULL, NULL, 0, &last->grid, NULL, NULL );
  //hash_cube(&last->grid)

  //hash_layers(source)
  convolve_atomic_patches( source, target, params, last );
  //hash_layers(&last->res_map)
  if( params->verbose )
    std_printf("remaining %ld big cells (actually, %d are unique)\n", IMG_SIZE(&last->grid), last->res_map.tz);

  // non-linear correction
  if( params->nlpow>0 )
    fastipow( &last->res_map, params->nlpow, params->n_thread );

  //hash_layers(&last->res_map)

  const int dense_step = params->subsample_ref ? 0 : psize/(1+(params->overlap<1));

  // aggregate patches for all subsequent levels
  while( 2*psize <= MIN(params->max_psize, MAX(src_shape[0], src_shape[1])) ) {
    L++;
    f *= 2;
    psize *= 2;
    res_maps.push_back(new_pyramid_level(f,psize));
    child = &res_maps[res_maps.size()-2]; // previous level
    last = &res_maps[res_maps.size()-1];  // current level
    if( params->verbose ) std_printf("layer %d, patch_size = %dx%d\n", L, psize, psize);

    // max pooling + subsampling
    //CHECK_MAPS(&child->res_map);
    last->true_shape[0] = child->true_shape[0]; // will be modified in subsampled2()
    last->true_shape[1] = child->true_shape[1];
    float_layers subs_res_map = {0};
    int_image* offsets = maxpool3_and_subsample2( &child->res_map, last->true_shape, &child->offsets,
                                                  &subs_res_map, params->n_thread );
    //CHECK_MAPS(&subs_res_map);

    // build the set of patches at this scale
    prepare_big_cells( src_shape, psize, params->overlap<L+1, params->overlap<L,
                       &child->grid, &child->norms, dense_step, &last->grid, &last->children, &last->norms );
    //DA(last->true_shape,2)
    //hash_cube(&last->grid)
    //hash_image(&last->norms)
    //hash_cube(&last->children)

    // aggregate children response maps to form parent response maps
    sparse_conv( &last->children, &child->assign, &child->norms, psize/f, &subs_res_map, offsets,
                 params->n_thread, last );
    free(subs_res_map.pixels);
    free_image(offsets);
    //CHECK_MAPS(&last->res_map);
    if( params->verbose )
      std_printf("remaining %ld big cells (actually, %d are unique)\n", IMG_SIZE(&last->grid), last->res_map.tz);

    // non-linear correction
    if( params->nlpow>0 )
      fastipow(&last->res_map, params->nlpow, params->n_thread );
    //hash_layers(&last->res_map)
  }
}

void free_matching_pyramid( matching_pyramid_t& res_maps ) {
  unsigned int i;
  for(i=0; i<res_maps.size(); i++) {
    res_scale& level = res_maps[i];

    free(level.grid.pixels);
    free(level.norms.pixels);
    free(level.assign.pixels);
    free(level.res_map.pixels);
    free(level.max_map.pixels);
    free(level.children.pixels);
    free(level.passed.pixels);
  }
}

#ifdef __APPLE__
static int arg_sort_maxima(void* arr, const void* a, const void* b) {
  float diff = ((float*)arr)[5*(*(int*)a)+4] - ((float*)arr)[5*(*(int*)b)+4];
  return (diff<0) - (diff>0); // descending order
}
#else
static int arg_sort_maxima(const void* a, const void* b, void* arr) {
  float diff = ((float*)arr)[5*(*(int*)a)+4] - ((float*)arr)[5*(*(int*)b)+4];
  return (diff<0) - (diff>0); // descending order
}
#endif

void reorder_rows( int_image* img, int_array* order )
{
  assert(order->tx==img->ty);
  const int tx = img->tx;
  int_image res = image_like(int, img);

  for(int i=0; i<order->tx; i++)
    memcpy(res.pixels + i*tx, img->pixels+order->pixels[i]*tx, tx*sizeof(int));

  free(img->pixels);
  *img = res;
}

// return points corresponding to patch matches
int_image* find_optimal_matchings( matching_pyramid_t& mp, const dm_params_t* params )
{
  const int nobordure = 0;
  int_image* maxima = NEW(int_image);
  int_array order = {0};

  if( params->maxima_mode ) { // normal process: maxima detection

    float th=0;
    int check_parents=false, check_children=false;

    float_array sc_maxima = empty_array(float,int(mp.size()));
    for(unsigned int i=0; i<mp.size(); i++) sc_maxima.pixels[i]=1; // useless but well

    _extract_maxima( mp.data(), mp.size(), &sc_maxima, th, params->min_level, params->nlpow,
                     check_parents, check_children, nobordure, maxima, params->n_thread );
    free(sc_maxima.pixels);

    order = empty_array(int,maxima->ty);
    for(int i=0; i<maxima->ty; i++) order.pixels[i] = maxima->ty-1-i; // last first

  } else { // we just analyse all cells at the top level
    const float_layers* rmap = &mp[mp.size()-1].res_map;
    const int tx = rmap->tx, txy=tx*rmap->ty;
    *maxima = empty_image(int, 5, (int)LAYERS_SIZE(rmap));

    for(int i=0; i<maxima->ty; i++) {
      int* row = maxima->pixels + 5*i;
      row[0] = mp.size()-1; // pyramid level
      row[1] = i/txy;       // layer number
      row[2] = i%tx;        // x position
      row[3] = (i%txy)/tx;  // y position
      ((float*)row)[4] = rmap->pixels[i];
    }
    //hash_image(maxima)

    order = empty_array(int,maxima->ty);
    for(int i=0; i<maxima->ty; i++) order.pixels[i] = i;
    #ifdef __APPLE__
    qsort_r(order.pixels, maxima->ty, sizeof(int), maxima->pixels, arg_sort_maxima);
    #else
    qsort_r(order.pixels, maxima->ty, sizeof(int), arg_sort_maxima, maxima->pixels);
    #endif
  }

  if( params->verbose>0 )
    std_printf("found %d local matches\n",maxima->ty);

  // reorder maxima
  reorder_rows( maxima, &order );
  free(order.pixels);
  return maxima;
}

static inline float ptdot( const float* m, float x, float y ) {
  return x*m[0] + y*m[1] + m[2];
}

void apply_rot( float_cube* corres, float rot[6] ) {
  assert( corres->tz == 6 );
  const int nb = IMG_SIZE(corres);
  float* p = corres->pixels;

  for(int i=0; i<nb; i++) {
    // only apply to coordinates of the first image
    float x = p[0], y = p[1];
    p[0] = ptdot(rot+0, x, y);
    p[1] = ptdot(rot+3, x, y);
    p += 6;
  }
}

/* this function gathers correspondences from each local maximum in the
   response maps
*/
float_image* gather_correspondences( int src_shape[2], int target_shape[2],
                                     matching_pyramid_t& scales, int_image* maxima,
                                     const dm_params_t* params, full_corres_t* corres_out )
{
  const int step = 4*scales[0].f; // bin size
  const int n_scales = (int)scales.size();
  const int tx = maxima->tx;
  const int n_maxima = maxima->ty;

  float_cube corres0 = zeros_cube(float, (src_shape[0]+step-1)/step, (src_shape[1]+step-1)/step,6);
  float_cube corres1 = zeros_cube(float, (target_shape[0]+step-1)/step, (target_shape[1]+step-1)/step,6);

  int i;
  // allocate temporary optimization maps
  for(i=0; i<n_scales; i++) {
    long size = LAYERS_SIZE(&scales[i].res_map);
    if( params->low_mem && size > 1000003 ) size = 1000003; // big prime
    assert( size <= 2147483647 || !"try using -mem parameter");
    scales[i].passed = zeros_array(float, (int)size);
  }

  #if defined(USE_OPENMP)
  #pragma omp parallel for schedule(static,1) num_threads(params->n_thread)
  #endif
  for(i=0; i<n_maxima; i++) {
    if(params->verbose && i%100==0) std_printf("\rgathering correspondences %d%%...",100*i/n_maxima);
    int* m = maxima->pixels + tx*i;
    int level = m[0], num_map = m[1];
    int x = m[2], y = m[3];
    assert(level<n_scales);

    if( scales[level].offsets.pixels ) {
      // add offset to form real image coordinates
      x += scales[level].offsets.pixels[2*num_map+0];
      y += scales[level].offsets.pixels[2*num_map+1];
    }

    if( params->scoring_mode ) // new mode
      _argmax_correspondences( scales.data(), level, num_map, x, y, ((float*)m)[4],
                               &corres0, step, &corres1, step, i );
    else // old iccv mode
      _argmax_correspondences_v1( scales.data(), level, num_map, x, y, m[0]*((float*)m)[4],
                                  &corres0, step, &corres1, step, i );
  }

  // free optimization maps
  for(i=0; i<n_scales; i++) {
    free( scales[i].passed.pixels );
    scales[i].passed.pixels = NULL;
  }

  if(params->verbose) std_printf("\n");

  if( params->rot45 ) { // rectify correspondences
    assert( corres_out );
    apply_rot( &corres0, corres_out->rot );
    apply_rot( &corres1, corres_out->rot );
  }

  // keep only reciprocal matches
  int nres;
  float* corres = _intersect_corres( &corres0, &corres1, &nres );
  float_image* res = NEW(float_image);
  *res = (float_image){corres, 6, nres};

  if( corres_out == NULL ) {
    free(corres0.pixels);
    free(corres1.pixels);
  }
  else { // save unfiltered correspondences
    corres_out->corres0 = corres0;
    corres_out->corres1 = corres1;
  }
  return res;
}

void eye_rot3x3( float rot[6] ) {
  memset( rot, 0, 6*sizeof(float));
  rot[0] = rot[4] = 1;
}

inline float bilinear_interp(const float* img, const int tx, const int ty, float x, float y ) {
  if( x < 0 || x+1.001 >= tx ) return 0; // outside
  if( y < 0 || y+1.001 >= ty ) return 0; // outside
  int ix = int(x);
  int iy = int(y);
  img += ix + iy*tx; // move pointer
  float rx = x - ix;
  float ry = y - iy;
  return (1-ry)*((1-rx)*img[0] + rx*img[1]) +
            ry *((1-rx)*img[tx]+ rx*img[tx+1]);
}
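
// Worked example (illustrative): for a 2x2 image {1, 2, 3, 4} stored row-major,
// bilinear_interp(img, 2, 2, 0.5, 0.5) = 0.5*(0.5*1 + 0.5*2) + 0.5*(0.5*3 + 0.5*4) = 2.5.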

void scale_rot3x3( float rot[6], float sc ) {
  for(int i=0; i<6; i++)
    rot[i] *= sc;
}

void inv_rot3x3( float rot[6], float res[6] ) {
  assert( fabs((rot[0]*rot[4] - rot[1]*rot[3]) - 1) < 1e-6 );
  // because the 2x2 part of rot is orthonormal, its inverse is its transpose
  res[0] = rot[0];
  res[1] = rot[3];
  res[3] = rot[1];
  res[4] = rot[4];
  res[2] = -rot[2]*rot[0] - rot[5]*rot[3];
  res[5] = -rot[2]*rot[1] - rot[5]*rot[4];
}
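
// Illustrative check: rot is a row-major 2x3 affine map [a b tx; c d ty] applied by
// ptdot(). For a 90-degree rotation rot = {0,-1,tx, 1,0,ty}, inv_rot3x3 returns
// {0,1,-ty, -1,0,tx}, i.e. the transposed rotation with the translation mapped back.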

// rotate a descriptor HOG image by a given angle
float_layers* rotate45( float_layers* hog, const dm_params_t* params, full_corres_t* corres_out ) {
  assert( corres_out ); // we need it to write rot !
  const int patch_size = get_atomic_patch_size( params );
  const int n_rot45 = params->rot45;

  if( (n_rot45 % 8) == 0 ) { // nothing to do
    eye_rot3x3( corres_out->rot );
    return hog;
  }
  const int tx = hog->tx;
  const int ty = hog->ty;

  // rotation matrix
  float angle = n_rot45 * M_PI / 4;
  float c = cos(angle), s = sin(angle);
  float rot[6] = {c, -s, 0, s, c, 0};
  // pt_in_original_image = rot * pt_in_rotated_image

  // determine center of rotation before
  float cx_before = tx/2.0;
  float cy_before = ty/2.0;
  // determine center of rotation after
  float corners[2][4] = {{0, (float)tx, (float)tx, 0}, {0, 0, (float)ty, (float)ty}};
  for(int i=0; i<4; i++) { // rotate corners
    float x = corners[0][i], y = corners[1][i];
    corners[0][i] = ptdot(rot+0, x, y);
    corners[1][i] = ptdot(rot+3, x, y);
  }
  int rot_size[2] = {int(0.5 + max_array_f(corners[0], 4) - min_array_f(corners[0], 4)),
                     int(0.5 + max_array_f(corners[1], 4) - min_array_f(corners[1], 4)) };
  get_source_shape( rot_size[0], rot_size[1], patch_size, rot_size );
  float cx_after = rot_size[0]/2.0;
  float cy_after = rot_size[1]/2.0;
  // compute the translation
  rot[2] = cx_before - ptdot(rot+0, cx_after, cy_after);
  rot[5] = cy_before - ptdot(rot+3, cx_after, cy_after);

  // create result
  assert( hog->tz == 9 );
  float_layers* rot_hog = NEW(float_layers);
  *rot_hog = empty_layers(float, rot_size[0], rot_size[1], 9);

  for(int c=0; c<hog->tz; c++) {
    const int src_c = (c<8) ? int((c+n_rot45+256)%8) : c; // roll channels except for last one (see hog.h)
    const float* f = hog->pixels + src_c * IMG_SIZE(hog);
    float* p = rot_hog->pixels + c * IMG_SIZE(rot_hog);

    for(int y=0; y<rot_size[1]; y++)
      for(int x=0; x<rot_size[0]; x++) {
        float rx = ptdot( rot+0, x, y);
        float ry = ptdot( rot+3, x, y);

        *p++ = bilinear_interp(f, tx, ty, rx, ry );
      }
  }

  // output inverted rot
  memcpy( corres_out->rot, rot, 6*sizeof(float) );

  return rot_hog;
}

// set default parameters
void set_default_dm_params( dm_params_t* params )
{
  // pixel descriptor params
  set_default_desc_params( &params->desc_params );

  // general parameters
  params->prior_img_downscale = 1; // resolution R = 1/2^downscale, default = 1/2
  params->rot45 = 0;             // don't rotate the first image
  params->overlap = 999;         // don't use overlapping patches
  params->subsample_ref = false; // don't subsample patches in reference image (=first image)
  params->nlpow = 1.4;
  params->ngh_rad = 0;           // no limit by default
  params->maxima_mode = 0;       // don't use maxima, just start from all top patches
  params->min_level = 2;         // useless
  params->max_psize = 999;       // maximum patch size
  params->low_mem = true;        // optimize mem but then results are slightly unstable/non-reproducible
  params->verbose = 0;
  params->scoring_mode = 1;      // improved scoring scheme
  params->n_thread = 1;          // no multithreading by default
}

// main function
float_image* deep_matching( image_t* img0, image_t* img1, const dm_params_t* params, full_corres_t* corres_out )
{
  // verify parameters
  assert(between(0,params->prior_img_downscale,3));
  assert(between(0,params->overlap,999));
  assert(between(0,params->subsample_ref,1));
  assert(between(0.1,params->nlpow,10));
  assert(between(0,params->ngh_rad,1<<16));
  assert(between(0,params->maxima_mode,1));
  assert(between(0,params->min_level,4));
  assert(between(0,params->low_mem,1));
  assert(between(0,params->scoring_mode,1));
  assert(between(0,params->verbose,10));
  assert(between(1,params->n_thread,128));

  // extract pixel descriptors
  float_layers *source, *target;
  extract_image_desc( img0, img1, params, &source, &target );
  if( corres_out ) // the first image is rotated
    source = rotate45( source, params, corres_out );
  int src_shape[2] = {source->tx, source->ty};
  assert( LAYERS_SIZE(source) > 0 );
  int target_shape[2] = {target->tx, target->ty};
  assert( LAYERS_SIZE(target) > 0 );

  //hash_layers(source)
  //hash_layers(target)

  // compute local matchings
  matching_pyramid_t matching_pyr;
  compute_matching_pyr( source, target, params, matching_pyr );
  free_layers(source);
  free_layers(target);

  //hash_layers(&matching_pyr[matching_pyr.size()-1].res_map);

  // find optimal matchings (maxima)
  int_image* maxima = find_optimal_matchings(matching_pyr, params);

  //hash_image(maxima);

  // select the best displacements (maxpool merge)
  float_image* corres = gather_correspondences( src_shape, target_shape, matching_pyr, maxima, params, corres_out );

  //hash_image(corres);

  // free everything
  free_matching_pyramid(matching_pyr);
  free_layers(maxima);

  return corres;
}
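
/* Minimal usage sketch (illustrative only, not part of this file): it shows one plausible
   way to call deep_matching() with default parameters, assuming two gray-scale image_t*
   `im0` and `im1` have already been loaded elsewhere (the loader is not shown). The 6-float
   row layout (x0, y0, x1, y1, score, index) mirrors the correspondence cubes built in
   gather_correspondences() above.

     dm_params_t params;
     set_default_dm_params( &params );
     params.n_thread = 4; // optional

     float_image* corres = deep_matching( im0, im1, &params, NULL );
     for( int i = 0; i < corres->ty; i++ ) {
       const float* r = corres->pixels + 6*i;
       std_printf( "%g %g -> %g %g (score %g)\n", r[0], r[1], r[2], r[3], r[4] );
     }
     free_image( corres );
*/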

void swap_first_second_img( float_cube* corres ) {
  assert( corres->tz == 6 );
  const int nb = IMG_SIZE(corres);
  float* p = corres->pixels;

  for(int i = 0; i < nb; i++) {
    float a = p[0];
    float b = p[1];
    float c = p[2];
    float d = p[3];
    *p++ = c;
    *p++ = d;
    *p++ = a;
    *p++ = b;
    p += 2;
  }
}

void rescale_corres( float_cube* corres, float f0, float f1, int code ) {
  assert( corres->tz == 6 );
  const int nb = IMG_SIZE(corres);
  float* p = corres->pixels;

  for(int i = 0; i < nb; i++) {
    p[0] *= f0;
    p[1] *= f0;
    p[2] *= f1;
    p[3] *= f1;
    p[5] = code;
    p += 6;
  }
}

// set default parameters
void set_default_scalerot_params( scalerot_params_t* params ) {
  params->fast = true;
  params->min_sc0 = 0; // scale = 2^(-0/2) = 1
  params->max_sc0 = 5; // scale = 2^(-5/2) = 0.176
  params->min_sc1 = 0;
  params->max_sc1 = 5;
  params->min_rot = 0; // rot = 0*45 = 0
  params->max_rot = 8; // rot = 8*45 = 360
}

// main function for scale/rotation invariant version
float_image* deep_matching_scale_rot( image_t* img0, image_t* img1, dm_params_t* params,
                                      const scalerot_params_t* sr_params ) {
  // verify parameters
  assert(sr_params->min_sc0 < sr_params->max_sc0);
  assert(sr_params->min_sc1 < sr_params->max_sc1);
  assert(between(0, sr_params->min_sc0, 5));
  assert(between(0, sr_params->max_sc0, 5));
  assert(between(0, sr_params->min_sc1, 5));
  assert(between(0, sr_params->max_sc1, 5));
  assert(sr_params->min_rot >= 0);
  assert(between(1,sr_params->max_rot - sr_params->min_rot, 8));

  // init shape
  const int psize = get_atomic_patch_size(params);
  int imshape0[2];
  get_source_shape( img0->width, img0->height, psize, imshape0 );
  int imshape1[2] = {img1->width, img1->height};

  // check dm params to ensure everything goes fine from now on
  #define mean_dim(shape) ((shape[0] + shape[1])/2)
  params->max_psize = MIN(mean_dim(imshape0), mean_dim(imshape1));
  const int verbose = params->verbose;
  params->verbose = MAX(0, verbose - 1); // decrease for inner deepmatchings

  // prepare output
  const int step0 = psize/2;
  const int step1 = psize/2;
  float_cube all_corres0 = zeros_cube(float, (imshape0[0]+step0/2-1)/step0, (imshape0[1]+step0/2-1)/step0, 6);
  float_cube all_corres1 = zeros_cube(float, (imshape1[0]+step1/2-1)/step1, (imshape1[1]+step1/2-1)/step1, 6);
  full_corres_t out;

  const int NS = 5;
  image_t *scaled_images1[NS] = {NULL};

  // loop over all scale*rot combinations
  for(int sc0 = sr_params->min_sc0;
          sc0 < sr_params->max_sc0;
          sc0++) {
    const float scale0 = pow(2, -0.5*sc0 ); // scale factor for img0
    assert( scale0<=1 && sc0<5 );
    image_t* scaled_img0 = ( scale0 >= 1 ) ? img0 :
                           image_resize_bilinear_scale( img0, scale0 );

    for(int sc1 = sr_params->min_sc1;
            sc1 < sr_params->max_sc1;
            sc1++) {
      const float scale1 = pow(2, -0.5*sc1 ); // scale factor for img1
      assert( scale1<=1 && sc1<5 );
      // optimization, deactivate only if e.g. both images are blurry
      if( sr_params->fast && !(scale0==1 || scale1==1)) continue;

      image_t* scaled_img1 = scaled_images1[sc1 - sr_params->min_sc1];
      if( scaled_img1 == NULL ) {
        scaled_img1 = ( scale1 >= 1 ) ? img1 :
                      image_resize_bilinear_scale( img1, scale1 );
        // remember result
        scaled_images1[sc1 - sr_params->min_sc1] = scaled_img1;
      }

      for(int rotation = sr_params->min_rot;
              rotation < sr_params->max_rot;
              rotation++) {
        assert( rotation >= 0 );
        const int rot_scale_code = 8*(sc1*5+sc0) + (rotation%8); // cannot be negative, because of bin count

        if( verbose )
          std_printf( "processing scale = (x%g, x%g) + rotation = %d deg (code %d)...\n",
                      scale0, scale1, 45*rotation, rot_scale_code);

        float rot0[6], rot1[6];

        // compute correspondences with rotated+scaled image
        #define max_dim(img) MAX(img->width, img->height)
        if( max_dim(scaled_img0) >= max_dim(scaled_img1) ) { // first image is always the largest
          params->rot45 = rotation;

          float_image* corres = deep_matching(scaled_img0, scaled_img1, params, &out );
          free_image( corres ); // we don't care

          inv_rot3x3(out.rot, rot0);
          eye_rot3x3(rot1);

        } else { // scaled_img1 is larger
          params->rot45 = -rotation;

          float_image* corres = deep_matching(scaled_img1, scaled_img0, params, &out );
          free_image( corres ); // we don't care

          // swap first and second image coordinates
          memswap( &out.corres0, &out.corres1, sizeof(float_cube) );
          swap_first_second_img( &out.corres0 );
          swap_first_second_img( &out.corres1 );

          inv_rot3x3(out.rot, rot1);
          eye_rot3x3(rot0);
        }

        // change scale of correspondences
        rescale_corres( &out.corres0, 1/scale0, 1/scale1, rot_scale_code );
        rescale_corres( &out.corres1, 1/scale0, 1/scale1, rot_scale_code );
        scale_rot3x3(rot0, scale0);
        scale_rot3x3(rot1, scale1);

        // merge correspondences in the reference frame
        merge_corres( rot0, rot1,
                      psize, psize, &out.corres0, &out.corres1, 2,
                      step0, step1, &all_corres0, &all_corres1 ); // finer grid for merge

        free(out.corres0.pixels);
        free(out.corres1.pixels);
      }
    }

    // free memory
    if( img0 != scaled_img0 )
      image_delete( scaled_img0 );
  }

  // final intersection
  int nres;
  float* corres = _intersect_corres( &all_corres0, &all_corres1, &nres );
  float_image* res = NEW(float_image);
  *res = (float_image){corres, 6, nres};

  // free memory
  for(int i=0; i<NS; i++)
    if( scaled_images1[i] != img1 )
      image_delete( scaled_images1[i] );
  free(all_corres0.pixels);
  free(all_corres1.pixels);

  return res;
}