////////////////////////////////////////////////////////////////////////////
//  File:           SparseBundleCU.cpp
//  Author:         Changchang Wu
//  Description :   implementation of the CUDA-based multicore bundle adjustment
//
//  Copyright (c) 2011  Changchang Wu (ccwu@cs.washington.edu)
//    and the University of Washington at Seattle
//
//  This library is free software; you can redistribute it and/or
//  modify it under the terms of the GNU General Public
//  License as published by the Free Software Foundation; either
//  Version 3 of the License, or (at your option) any later version.
//
//  This library is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
//  General Public License for more details.
//
////////////////////////////////////////////////////////////////////////////////

#include <vector>
#include <iostream>
#include <utility>
#include <algorithm>
#include <fstream>
#include <iomanip>
using std::vector;
using std::cout;
using std::pair;
using std::ofstream;

#include <stdlib.h>
#include <math.h>
#include <float.h>
#include "pba.h"
#include "SparseBundleCU.h"

#include "ProgramCU.h"

using namespace pba::ProgramCU;

#ifdef _WIN32
#define finite _finite
#endif
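
// Typical driver-side usage (sketch only; the actual driver lives outside this
// file, see pba.h for the public interface):
//
//   SparseBundleCU pba(0);                            // pick CUDA device 0
//   pba.SetCameraData(ncam, cameras);                 // CameraT = 16 floats each
//   pba.SetPointData(npts, points);                   // Point3D stored as 4 floats
//   pba.SetProjection(nproj, imgpts, ptidx, camidx);  // 2D measurements + indices
//   pba.RunBundleAdjustment();                        // cameras/points updated in place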

namespace pba {

typedef float float_t;  // data type for host computation; double doesn't make
                        // much difference

#define CHECK_VEC(v1, v2)                                                 \
  for (size_t j = 0; j < v1.size(); ++j) {                                \
    if (v1[j] != v2[j]) {                                                 \
      different++;                                                        \
      std::cout << i << ' ' << j << ' ' << v1[j] << ' ' << v2[j] << '\n'; \
    }                                                                     \
  }
#define DEBUG_FUNCN(v, func, input, N)                                  \
  if (__debug_pba && v.IsValid()) {                                     \
    vector<float> buf(v.GetLength()), buf_(v.GetLength());              \
    for (int i = 0; i < N; ++i) {                                       \
      int different = 0;                                                \
      func input;                                                       \
      ProgramCU::FinishWorkCUDA();                                      \
      if (i > 0) {                                                      \
        v.CopyToHost(&buf_[0]);                                         \
        CHECK_VEC(buf, buf_);                                           \
      } else {                                                          \
        v.CopyToHost(&buf[0]);                                          \
      }                                                                 \
      if (different != 0)                                               \
        std::cout << #func << " : " << i << " : " << different << '\n'; \
    }                                                                   \
  }
#define DEBUG_FUNC(v, func, input) DEBUG_FUNCN(v, func, input, 2)
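
// The DEBUG_FUNC* macros above run a GPU routine N times on identical input,
// copy the result vector v back to the host after each run, and count
// element-wise mismatches against the first run; any nonzero count flags a
// non-deterministic (or corrupted) kernel when __debug_pba is enabled.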

SparseBundleCU::SparseBundleCU(int device)
    : ParallelBA(PBA_INVALID_DEVICE),
      _num_camera(0),
      _num_point(0),
      _num_imgpt(0),
      _num_imgpt_q(0),
      _camera_data(NULL),
      _point_data(NULL),
      _imgpt_data(NULL),
      _camera_idx(NULL),
      _point_idx(NULL),
      _projection_sse(0) {
  __selected_device = device;
}

size_t SparseBundleCU::GetMemCapacity() {
  if (__selected_device != __current_device) SetCudaDevice(__selected_device);
  size_t sz = ProgramCU::GetCudaMemoryCap();
  if (sz < 1024) std::cout << "ERROR: CUDA is unlikely to be supported!\n";
  return sz < 1024 ? 0 : sz;
}

void SparseBundleCU::SetCameraData(size_t ncam, CameraT* cams) {
  if (sizeof(CameraT) != 16 * sizeof(float)) exit(0);  // CameraT must be exactly 16 floats for GPU upload
  _num_camera = (int)ncam;
  _camera_data = cams;
  _focal_mask = NULL;
}

void SparseBundleCU::SetFocalMask(const int* fmask, float weight) {
  _focal_mask = fmask;
  _weight_q = weight;
}

void SparseBundleCU::SetPointData(size_t npoint, Point3D* pts) {
  _num_point = (int)npoint;
  _point_data = (float*)pts;
}

void SparseBundleCU::SetProjection(size_t nproj, const Point2D* imgpts,
                                   const int* point_idx, const int* cam_idx) {
  _num_imgpt = (int)nproj;
  _imgpt_data = (float*)imgpts;
  _camera_idx = cam_idx;
  _point_idx = point_idx;
  _imgpt_datax.resize(0);
}

float SparseBundleCU::GetMeanSquaredError() {
  return float(_projection_sse /
               (_num_imgpt * __focal_scaling * __focal_scaling));
}

void SparseBundleCU::BundleAdjustment() {
  if (ValidateInputData() != STATUS_SUCCESS) return;

  //

  ////////////////////////
  TimerBA timer(this, TIMER_OVERALL);

  NormalizeData();
  if (InitializeBundle() != STATUS_SUCCESS) {
    // failed to allocate gpu storage
  } else if (__profile_pba) {
    // profiling some stuff
    RunProfileSteps();
  } else {
    // real optimization
    AdjustBundleAdjsutmentMode();
    NonlinearOptimizeLM();
    TransferDataToHost();
  }
  DenormalizeData();
}

int SparseBundleCU::RunBundleAdjustment() {
  if (__warmup_device) WarmupDevice();
  ResetBundleStatistics();
  BundleAdjustment();
  if (__num_lm_success > 0)
    SaveBundleStatistics(_num_camera, _num_point, _num_imgpt);
  if (__num_lm_success > 0) PrintBundleStatistics();
  ResetTemporarySetting();
  return __num_lm_success;
}

bool SparseBundleCU::InitializeBundleGPU() {
  bool previous_allocated = __memory_usage > 0;

  bool success = TransferDataToGPU() && InitializeStorageForCG();
  if (!success && previous_allocated) {
    if (__verbose_level) std::cout << "WARNING: try clean allocation\n";
    ClearPreviousError();
    ReleaseAllocatedData();
    success = TransferDataToGPU() && InitializeStorageForCG();
  }

  if (!success && __jc_store_original) {
    if (__verbose_level) std::cout << "WARNING: try not storing original JC\n";
    __jc_store_original = false;
    ClearPreviousError();
    ReleaseAllocatedData();
    success = TransferDataToGPU() && InitializeStorageForCG();
  }
  if (!success && __jc_store_transpose) {
    if (__verbose_level) std::cout << "WARNING: try not storing transpose JC\n";
    __jc_store_transpose = false;
    ClearPreviousError();
    ReleaseAllocatedData();
    success = TransferDataToGPU() && InitializeStorageForCG();
  }
  if (!success && !__no_jacobian_store) {
    if (__verbose_level) std::cout << "WARNING: switch to memory saving mode\n";
    __no_jacobian_store = true;
    ClearPreviousError();
    ReleaseAllocatedData();
    success = TransferDataToGPU() && InitializeStorageForCG();
  }
  return success;
}

int SparseBundleCU::ValidateInputData() {
  if (_camera_data == NULL) return STATUS_CAMERA_MISSING;
  if (_point_data == NULL) return STATUS_POINT_MISSING;
  if (_imgpt_data == NULL) return STATUS_MEASURMENT_MISSING;
  if (_camera_idx == NULL || _point_idx == NULL)
    return STATUS_PROJECTION_MISSING;
  return STATUS_SUCCESS;
}

void SparseBundleCU::WarmupDevice() {
  std::cout << "Warm up device with storage allocation...\n";
  if (__selected_device != __current_device) SetCudaDevice(__selected_device);
  CheckRequiredMemX();
  InitializeBundleGPU();
}

int SparseBundleCU::InitializeBundle() {
  /////////////////////////////////////////////////////
  TimerBA timer(this, TIMER_GPU_ALLOCATION);
  if (__selected_device != __current_device) SetCudaDevice(__selected_device);
  CheckRequiredMemX();
  ReserveStorageAuto();
  if (!InitializeBundleGPU()) return STATUS_ALLOCATION_FAIL;
  return STATUS_SUCCESS;
}

int SparseBundleCU::GetParameterLength() {
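  // 8 unknowns per camera (3 for the rotation update, 3 for translation, plus
  // focal length and distortion) and 4 floats per point (xyz plus one padding
  // float), matching the 8m/4n block offsets used throughout the CG solver.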
  return _num_camera * 8 + 4 * _num_point;
}

bool SparseBundleCU::CheckRequiredMemX() {
  if (CheckRequiredMem(0)) return true;
  if (__jc_store_original) {
    if (__verbose_level) std::cout << "NOTE: not storing original JC\n";
    __jc_store_original = false;
    if (CheckRequiredMem(1)) return true;
  }
  if (__jc_store_transpose) {
    if (__verbose_level) std::cout << "NOTE:  not storing camera Jacobian\n";
    __jc_store_transpose = false;
    if (CheckRequiredMem(1)) return true;
  }
  if (!__no_jacobian_store) {
    if (__verbose_level) std::cout << "NOTE: not storing any Jacobian\n";
    __no_jacobian_store = true;
    if (CheckRequiredMem(1)) return true;
  }
  return false;
}

bool SparseBundleCU::CheckRequiredMem(int fresh) {
  int m = _num_camera, n = _num_point, k = _num_imgpt;
#ifdef PBA_CUDA_ALLOCATE_MORE
  if (!fresh) {
    int m0 = _cuCameraData.GetReservedWidth();
    m = std::max(m, m0);
    int n0 = _cuPointData.GetReservedWidth();
    n = std::max(n, n0);
    int k0 = _cuMeasurements.GetReservedWidth();
    k = std::max(k, k0);
  }
#endif

  int p = 8 * m + 4 * n, q = _num_imgpt_q;
  size_t szn, total = GetCudaMemoryCap();
  size_t sz0 = 800 * 600 * 2 * 4 * sizeof(float);  //
  size_t szq = q > 0 ? (sizeof(float) * (q + m) * 4) : 0;
  size_t sz = sizeof(float) * (258 + 9 * n + 33 * m + 7 * k) + sz0;
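  // The estimates below mirror the allocations in TransferDataToGPU() and
  // InitializeStorageForCG(); each optional Jacobian block is dropped from the
  // plan (by clearing its flag) whenever adding it would exceed the capacity.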

  /////////////////////////////////// CG
  sz += p * 6 * sizeof(float);
  sz += ((__use_radial_distortion ? 64 : 56) * m + 12 * n) * sizeof(float);
  sz += (2 * (k + q) * sizeof(float));
  if (sz > total) return false;

  /////////////////////////////////////
  szn = (__no_jacobian_store ? 0 : (sizeof(float) * 8 * k));
  if (sz + szn > total)
    // the point Jacobian does not fit: fall back to the memory-saving mode
    __no_jacobian_store = true;
  else
    sz += szn;
  /////////////////////////////
  szn = ((!__no_jacobian_store && __jc_store_transpose) ? 16 * k * sizeof(float)
                                                        : 0);
  if (sz + szn > total)
    __jc_store_transpose = false;
  else
    sz += szn;
  ///////////////////////////
  szn = ((!__no_jacobian_store && __jc_store_original) ? 16 * k * sizeof(float)
                                                       : 0);
  if (sz + szn > total)
    __jc_store_original = false;
  else
    sz += szn;
  ///////////////////////////////
  szn = ((!__no_jacobian_store && __jc_store_transpose && !__jc_store_original)
             ? k * sizeof(int)
             : 0);
  if (sz + szn > total) {
    __jc_store_transpose = false;
    sz -= (16 * k * sizeof(float));
  } else
    sz += szn;

  return sz <= total;
}

void SparseBundleCU::ReserveStorage(size_t ncam, size_t npt, size_t nproj) {
  if (ncam <= 1 || npt <= 1 || nproj <= 1) {
    ReleaseAllocatedData();
    // Reset the memory strategy to the default.
    __jc_store_transpose = true;
    __jc_store_original = true;
    __no_jacobian_store = false;
  } else {
    const int* camidx = _camera_idx;
    const int* ptidx = _point_idx;
    int ncam_ = _num_camera;
    int npt_ = _num_point;
    int nproj_ = _num_imgpt;

#ifdef PBA_CUDA_ALLOCATE_MORE
    size_t ncam_reserved = _cuCameraData.GetReservedWidth();
    size_t npt_reserved = _cuPointData.GetReservedWidth();
    size_t nproj_reserved = _cuMeasurements.GetReservedWidth();
    ncam = std::max(ncam, ncam_reserved);
    npt = std::max(npt, npt_reserved);
    nproj = std::max(nproj, nproj_reserved);
#endif

    _camera_idx = NULL;
    _point_idx = NULL;
    _num_camera = (int)ncam;
    _num_point = (int)npt;
    _num_imgpt = (int)nproj;

    if (__verbose_level)
      std::cout << "Reserving storage for ncam = " << ncam << "; npt = " << npt
                << "; nproj = " << nproj << '\n';
    InitializeBundleGPU();

    _num_camera = ncam_;
    _num_point = npt_;
    _num_imgpt = nproj_;
    _camera_idx = camidx;
    _point_idx = ptidx;
  }
}

static size_t upgrade_dimension(size_t sz) {
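  // Round up to the next power of two so that reserved GPU buffers can be
  // reused when the problem grows slightly between calls.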
  size_t x = 1;
  while (x < sz) x <<= 1;
  return x;
}

void SparseBundleCU::ReserveStorageAuto() {
  if (_cuCameraData.data() == NULL || _cuPointData.data() == NULL ||
      _cuMeasurements.data() == NULL)
    return;
  ReserveStorage(upgrade_dimension(_num_camera), upgrade_dimension(_num_point),
                 upgrade_dimension(_num_imgpt));
}

#define REPORT_ALLOCATION(NAME)                                   \
  if (__verbose_allocation && NAME.GetDataSize() > 1024)          \
    std::cout << (NAME.GetDataSize() > 1024 * 1024                \
                      ? NAME.GetDataSize() / 1024 / 1024          \
                      : NAME.GetDataSize() / 1024)                \
              << (NAME.GetDataSize() > 1024 * 1024 ? "MB" : "KB") \
              << "\t allocated for " #NAME "\n";

#define ASSERT_ALLOCATION(NAME)                                    \
  if (!success) {                                                  \
    std::cerr << "WARNING: failed to allocate "                    \
              << (__verbose_allocation ? #NAME "; size = " : "")   \
              << (total_sz / 1024 / 1024) << "MB + "               \
              << (NAME.GetRequiredSize() / 1024 / 1024) << "MB\n"; \
    return false;                                                  \
  } else {                                                         \
    total_sz += NAME.GetDataSize();                                \
    REPORT_ALLOCATION(NAME);                                       \
  }

#define CHECK_ALLOCATION(NAME)                                     \
  if (NAME.GetDataSize() == 0 && NAME.GetRequiredSize() > 0) {     \
    ClearPreviousError();                                          \
    std::cerr << "WARNING: unable to allocate " #NAME ": "         \
              << (NAME.GetRequiredSize() / 1024 / 1024) << "MB\n"; \
  } else {                                                         \
    total_sz += NAME.GetDataSize();                                \
    REPORT_ALLOCATION(NAME);                                       \
  }

#define ALLOCATE_REQUIRED_DATA(NAME, num, channels) \
  {                                                 \
    success &= NAME.InitTexture(num, 1, channels);  \
    ASSERT_ALLOCATION(NAME);                        \
  }

#define ALLOCATE_OPTIONAL_DATA(NAME, num, channels, option) \
  if (option) {                                             \
    option = NAME.InitTexture(num, 1, channels);            \
    CHECK_ALLOCATION(NAME);                                 \
  } else {                                                  \
    NAME.InitTexture(0, 0, 0);                              \
  }
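
// ALLOCATE_REQUIRED_DATA aborts TransferDataToGPU()/InitializeStorageForCG()
// (returns false) when an allocation fails, while ALLOCATE_OPTIONAL_DATA only
// clears the corresponding storage flag and continues with the memory-saving path.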

bool SparseBundleCU::TransferDataToGPU() {
  // Given m cameras, n points, and k measurements, the comments below note the float count of each buffer.
  bool success = true;
  size_t total_sz = 0;

  /////////////////////////////////////////////////////////////////////////////
  vector<int> qmap, qlist;
  vector<float> qmapw, qlistw;
  ProcessIndexCameraQ(qmap, qlist);

  //////////////////////////////////////////////////////////////////////////////
  ALLOCATE_REQUIRED_DATA(_cuBufferData, 256, 1);  // 256
  ALLOCATE_REQUIRED_DATA(_cuPointData, _num_point, 4);  // 4n
  ALLOCATE_REQUIRED_DATA(_cuCameraData, _num_camera, 16);  // 16m
  ALLOCATE_REQUIRED_DATA(_cuCameraDataEX, _num_camera, 16);  // 16m

  ////////////////////////////////////////////////////////////////
  ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementMap, _num_camera + 1, 1);  // m
  ALLOCATE_REQUIRED_DATA(_cuCameraMeasurementList, _num_imgpt, 1);  // k
  ALLOCATE_REQUIRED_DATA(_cuPointMeasurementMap, _num_point + 1, 1);  // n
  ALLOCATE_REQUIRED_DATA(_cuProjectionMap, _num_imgpt, 2);  // 2k
  ALLOCATE_REQUIRED_DATA(_cuImageProj, _num_imgpt + _num_imgpt_q, 2);  // 2k
  ALLOCATE_REQUIRED_DATA(_cuPointDataEX, _num_point, 4);  // 4n
  ALLOCATE_REQUIRED_DATA(_cuMeasurements, _num_imgpt, 2);  // 2k

  //
  ALLOCATE_REQUIRED_DATA(_cuCameraQMap, _num_imgpt_q, 2);
  ALLOCATE_REQUIRED_DATA(_cuCameraQMapW, _num_imgpt_q, 2);
  ALLOCATE_REQUIRED_DATA(_cuCameraQList, (_num_imgpt_q > 0 ? _num_camera : 0),
                         2);
  ALLOCATE_REQUIRED_DATA(_cuCameraQListW, (_num_imgpt_q > 0 ? _num_camera : 0),
                         2);

  if (__no_jacobian_store) {
    _cuJacobianCamera.ReleaseData();
    _cuJacobianCameraT.ReleaseData();
    _cuJacobianPoint.ReleaseData();
    _cuCameraMeasurementListT.ReleaseData();
  } else {
    ALLOCATE_REQUIRED_DATA(_cuJacobianPoint, _num_imgpt * 2, 4);  // 8k
    ALLOCATE_OPTIONAL_DATA(_cuJacobianCameraT, _num_imgpt * 2, 8,
                           __jc_store_transpose);  // 16k
    ALLOCATE_OPTIONAL_DATA(_cuJacobianCamera, _num_imgpt * 2, 8,
                           __jc_store_original);  // 16k

    if ((!__jc_store_original || __profile_pba) && __jc_store_transpose) {
      ALLOCATE_OPTIONAL_DATA(_cuCameraMeasurementListT, _num_imgpt, 1,
                             __jc_store_transpose);  // k
      if (!__jc_store_transpose) _cuJacobianCameraT.ReleaseData();
    } else {
      _cuCameraMeasurementListT.ReleaseData();
    }
  }

  /////////////////////////////////////////////////
  if (_camera_idx && _point_idx) {
    //////////////////////////////////////////
    BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);
    //// mapping from camera to measurements
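    // Counting sort: cpnum counts projections per camera, cpi becomes the
    // prefix-sum offset table (camera -> first measurement slot), and cpidx
    // lists measurement indices grouped by camera.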
    vector<int> cpi(_num_camera + 1), cpidx(_num_imgpt);
    vector<int> cpnum(_num_camera, 0);
    cpi[0] = 0;
    for (int i = 0; i < _num_imgpt; ++i) cpnum[_camera_idx[i]]++;
    for (int i = 1; i <= _num_camera; ++i) cpi[i] = cpi[i - 1] + cpnum[i - 1];
    vector<int> cptidx = cpi;
    for (int i = 0; i < _num_imgpt; ++i) cpidx[cptidx[_camera_idx[i]]++] = i;
    if (_num_imgpt_q > 0) ProcessWeightCameraQ(cpnum, qmap, qmapw, qlistw);
    BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);

    ///////////////////////////////////////////////////////////////////////////////
    BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION);
    _cuMeasurements.CopyFromHost(_imgpt_datax.size() > 0 ? &_imgpt_datax[0]
                                                         : _imgpt_data);
    _cuCameraData.CopyFromHost(_camera_data);
    _cuPointData.CopyFromHost(_point_data);
    _cuCameraMeasurementMap.CopyFromHost(&cpi[0]);
    _cuCameraMeasurementList.CopyFromHost(&cpidx[0]);
    if (_cuCameraMeasurementListT.IsValid()) {
      vector<int> ridx(_num_imgpt);
      for (int i = 0; i < _num_imgpt; ++i) ridx[cpidx[i]] = i;
      _cuCameraMeasurementListT.CopyFromHost(&ridx[0]);
    }
    if (_num_imgpt_q > 0) {
      _cuCameraQMap.CopyFromHost(&qmap[0]);
      _cuCameraQMapW.CopyFromHost(&qmapw[0]);
      _cuCameraQList.CopyFromHost(&qlist[0]);
      _cuCameraQListW.CopyFromHost(&qlistw[0]);
    }
    BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION);

    ////////////////////////////////////////////
    /////// mapping from point to measurement
    BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);
    vector<int> ppi(_num_point + 1);
    for (int i = 0, last_point = -1; i < _num_imgpt; ++i) {
      int pt = _point_idx[i];
      while (last_point < pt) ppi[++last_point] = i;
    }
    ppi[_num_point] = _num_imgpt;

    //////////projection map
    vector<int> projection_map(_num_imgpt * 2);
    for (int i = 0; i < _num_imgpt; ++i) {
      int* imp = &projection_map[i * 2];
      imp[0] = _camera_idx[i] * 2;
      imp[1] = _point_idx[i];
    }
    BundleTimerSwap(TIMER_PREPROCESSING, TIMER_GPU_ALLOCATION);

    //////////////////////////////////////////////////////////////
    BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION);
    _cuPointMeasurementMap.CopyFromHost(&ppi[0]);
    _cuProjectionMap.CopyFromHost(&projection_map[0]);
    BundleTimerSwap(TIMER_GPU_UPLOAD, TIMER_GPU_ALLOCATION);
  }

  __memory_usage = total_sz;
  if (__verbose_level > 1)
    std::cout << "Memory for Motion/Structure/Jacobian:\t"
              << (total_sz / 1024 / 1024) << "MB\n";
  return success;
}

bool SparseBundleCU::ProcessIndexCameraQ(vector<int>& qmap,
                                         vector<int>& qlist) {
  // reset q-data
  qmap.resize(0);
  qlist.resize(0);
  _num_imgpt_q = 0;

  // verify input
  if (_camera_idx == NULL) return true;
  if (_point_idx == NULL) return true;
  if (_focal_mask == NULL) return true;
  if (_num_camera == 0) return true;
  if (_weight_q <= 0) return true;

  ///////////////////////////////////////

  int error = 0;
  vector<int> temp(_num_camera * 2, -1);
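  // temp[2*i] / temp[2*i+1] hold the previous / next camera of a circular
  // doubly-linked list built per shared-focal group (the group representative
  // is the camera whose _focal_mask entry points to itself); one soft
  // constraint per ring edge is emitted into qmap/qlist below.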

  for (int i = 0; i < _num_camera; ++i) {
    int iq = _focal_mask[i];
    if (iq > i) {
      error = 1;
      break;
    }
    if (iq < 0) continue;
    if (iq == i) continue;
    int ip = temp[2 * iq];
    // float ratio = _camera_data[i].f / _camera_data[iq].f;
    // if(ratio < 0.01 || ratio > 100)
    //{
    //  std::cout << "Warning: constaraints on largely different camreas\n";
    //  continue;
    //}else
    if (_focal_mask[iq] != iq) {
      error = 1;
      break;
    } else if (ip == -1) {
      temp[2 * iq] = i;
      temp[2 * iq + 1] = i;
      temp[2 * i] = iq;
      temp[2 * i + 1] = iq;
    } else {
      // maintain the circular doubly-linked list
      temp[2 * i] = ip;
      temp[2 * i + 1] = iq;
      temp[2 * ip + 1] = i;
      temp[2 * iq] = i;
    }
  }

  if (error) {
    std::cout << "Error: incorrect constraints\n";
    _focal_mask = NULL;
    return false;
  }

  qlist.resize(_num_camera * 2, -1);
  for (int i = 0; i < _num_camera; ++i) {
    int inext = temp[2 * i + 1];
    if (inext == -1) continue;
    qlist[2 * i] = _num_imgpt + _num_imgpt_q;
    qlist[2 * inext + 1] = _num_imgpt + _num_imgpt_q;
    qmap.push_back(i);
    qmap.push_back(inext);
    _num_imgpt_q++;
  }
  return true;
}

void SparseBundleCU::ProcessWeightCameraQ(vector<int>& cpnum, vector<int>& qmap,
                                          vector<float>& qmapw,
                                          vector<float>& qlistw) {
  // set average focal length and average radial distortion
  vector<float> qpnum(_num_camera, 0), qcnum(_num_camera, 0);
  vector<float> fs(_num_camera, 0), rs(_num_camera, 0);

  for (int i = 0; i < _num_camera; ++i) {
    int qi = _focal_mask[i];
    if (qi == -1) continue;
    // float ratio = _camera_data[i].f / _camera_data[qi].f;
    // if(ratio < 0.01 || ratio > 100)      continue;
    fs[qi] += _camera_data[i].f;
    rs[qi] += _camera_data[i].radial;
    qpnum[qi] += cpnum[i];
    qcnum[qi] += 1.0f;
  }

  // This does not seem to matter much; the values converge during optimization anyway.
  for (int i = 0; i < _num_camera; ++i) {
    int qi = _focal_mask[i];
    if (qi == -1) continue;
    // float ratio = _camera_data[i].f / _camera_data[qi].f;
    // if(ratio < 0.01 || ratio > 100)      continue;
    _camera_data[i].f = fs[qi] / qcnum[qi];
    _camera_data[i].radial = rs[qi] / qcnum[qi];
  }

  qmapw.resize(_num_imgpt_q * 2, 0);
  qlistw.resize(_num_camera * 2, 0);
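  // The per-constraint weight is the user weight scaled by sqrt(average number
  // of projections per camera in the group), presumably to keep the soft
  // focal/distortion constraints comparable in magnitude to the reprojection
  // terms; the radial component is additionally scaled by the averaged focal length.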
  for (int i = 0; i < _num_imgpt_q; ++i) {
    int cidx = qmap[i * 2], qi = _focal_mask[cidx];
    float wi = sqrt(qpnum[qi] / qcnum[qi]) * _weight_q;
    float wr = (__use_radial_distortion ? wi * _camera_data[qi].f : 0.0);
    qmapw[i * 2] = wi;
    qmapw[i * 2 + 1] = wr;
    qlistw[cidx * 2] = wi;
    qlistw[cidx * 2 + 1] = wr;
  }
}

void SparseBundleCU::ReleaseAllocatedData() {
  _cuCameraData.ReleaseData();
  _cuCameraDataEX.ReleaseData();
  _cuPointData.ReleaseData();
  _cuPointDataEX.ReleaseData();
  _cuMeasurements.ReleaseData();
  _cuImageProj.ReleaseData();
  _cuJacobianCamera.ReleaseData();
  _cuJacobianPoint.ReleaseData();
  _cuJacobianCameraT.ReleaseData();
  _cuProjectionMap.ReleaseData();
  _cuPointMeasurementMap.ReleaseData();
  _cuCameraMeasurementMap.ReleaseData();
  _cuCameraMeasurementList.ReleaseData();
  _cuCameraMeasurementListT.ReleaseData();
  _cuBufferData.ReleaseData();
  _cuBlockPC.ReleaseData();
  _cuVectorJtE.ReleaseData();
  _cuVectorJJ.ReleaseData();
  _cuVectorJX.ReleaseData();
  _cuVectorXK.ReleaseData();
  _cuVectorPK.ReleaseData();
  _cuVectorZK.ReleaseData();
  _cuVectorRK.ReleaseData();
  _cuVectorSJ.ReleaseData();
  _cuCameraQList.ReleaseData();
  _cuCameraQMap.ReleaseData();
  _cuCameraQMapW.ReleaseData();
  _cuCameraQListW.ReleaseData();
  ProgramCU::ResetCurrentDevice();
}

void SparseBundleCU::NormalizeDataF() {
  int incompatible_radial_distortion = 0;
  if (__focal_normalize) {
    if (__focal_scaling == 1.0f) {
      //------------------------------------------------------------------
      //////////////////////////////////////////////////////////////
      vector<float> focals(_num_camera);
      for (int i = 0; i < _num_camera; ++i) focals[i] = _camera_data[i].f;
      std::nth_element(focals.begin(), focals.begin() + _num_camera / 2,
                       focals.end());
      float median_focal_length = focals[_num_camera / 2];
      __focal_scaling = __data_normalize_median / median_focal_length;
      float radial_factor = median_focal_length * median_focal_length * 4.0f;

      ///////////////////////////////
      _imgpt_datax.resize(_num_imgpt * 2);
      for (int i = 0; i < _num_imgpt * 2; ++i)
        _imgpt_datax[i] = _imgpt_data[i] * __focal_scaling;
      for (int i = 0; i < _num_camera; ++i) {
        _camera_data[i].f *= __focal_scaling;
        if (!__use_radial_distortion) {
        } else if (__reset_initial_distortion) {
          _camera_data[i].radial = 0;
        } else if (_camera_data[i].distortion_type != __use_radial_distortion) {
          incompatible_radial_distortion++;
          _camera_data[i].radial = 0;
        } else if (__use_radial_distortion == -1) {
          _camera_data[i].radial *= radial_factor;
        }
      }
      if (__verbose_level > 2)
        std::cout << "Focal length normalized by " << __focal_scaling << '\n';
      __reset_initial_distortion = false;
    }
  } else {
    if (__use_radial_distortion) {
      for (int i = 0; i < _num_camera; ++i) {
        if (__reset_initial_distortion) {
          _camera_data[i].radial = 0;
        } else if (_camera_data[i].distortion_type != __use_radial_distortion) {
          _camera_data[i].radial = 0;
          incompatible_radial_distortion++;
        }
      }
      __reset_initial_distortion = false;
    }
    _imgpt_datax.resize(0);
  }

  if (incompatible_radial_distortion) {
    std::cout << "ERROR: incompatible radial distortion input; reset to 0;\n";
  }
}

void SparseBundleCU::NormalizeDataD() {
  if (__depth_scaling == 1.0f) {
    const float dist_bound = 1.0f;
    vector<float> oz(_num_imgpt);
    vector<float> cpdist1(_num_camera, dist_bound);
    vector<float> cpdist2(_num_camera, -dist_bound);
    vector<int> camnpj(_num_camera, 0), cambpj(_num_camera, 0);
    int bad_point_count = 0;
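    // oz[i] is the depth of projection i's point in its camera frame;
    // cpdist1/cpdist2 track, per camera, the smallest positive and largest
    // negative depth among near-degenerate points, and are used below to decide
    // whether to adjust that camera's z translation (t[2]).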
    for (int i = 0; i < _num_imgpt; ++i) {
      int cmidx = _camera_idx[i];
      CameraT* cam = _camera_data + cmidx;
      float* rz = cam->m[2];
      float* x = _point_data + 4 * _point_idx[i];
      oz[i] = (rz[0] * x[0] + rz[1] * x[1] + rz[2] * x[2] + cam->t[2]);

      /////////////////////////////////////////////////
      // points behind the camera can cause serious problems
      float ozr = oz[i] / cam->t[2];
      if (fabs(ozr) < __depth_check_epsilon) {
        bad_point_count++;
        float px = cam->f * (cam->m[0][0] * x[0] + cam->m[0][1] * x[1] +
                             cam->m[0][2] * x[2] + cam->t[0]);
        float py = cam->f * (cam->m[1][0] * x[0] + cam->m[1][1] * x[1] +
                             cam->m[1][2] * x[2] + cam->t[1]);
        float mx = _imgpt_data[i * 2], my = _imgpt_data[2 * i + 1];
        bool checkx = fabs(mx) > fabs(my);
        if ((checkx && px * oz[i] * mx < 0 && fabs(mx) > 64) ||
            (!checkx && py * oz[i] * my < 0 && fabs(my) > 64)) {
          if (__verbose_level > 3)
            std::cout << "Warning: proj of #" << cmidx
                      << " on the wrong side, oz = " << oz[i] << " ("
                      << (px / oz[i]) << ',' << (py / oz[i]) << ") (" << mx
                      << ',' << my << ")\n";
          /////////////////////////////////////////////////////////////////////////
          if (oz[i] > 0)
            cpdist2[cmidx] = 0;
          else
            cpdist1[cmidx] = 0;
        }
        if (oz[i] >= 0)
          cpdist1[cmidx] = std::min(cpdist1[cmidx], oz[i]);
        else
          cpdist2[cmidx] = std::max(cpdist2[cmidx], oz[i]);
      }
      if (oz[i] < 0) {
        __num_point_behind++;
        cambpj[cmidx]++;
      }
      camnpj[cmidx]++;
    }
    if (bad_point_count > 0 && __depth_degeneracy_fix) {
      if (!__focal_normalize || !__depth_normalize)
        std::cout << "Enable data normalization on degeneracy\n";
      __focal_normalize = true;
      __depth_normalize = true;
    }
    if (__depth_normalize) {
      std::nth_element(oz.begin(), oz.begin() + _num_imgpt / 2, oz.end());
      float oz_median = oz[_num_imgpt / 2];
      float shift_min = std::min(oz_median * 0.001f, 1.0f);
      float dist_threshold = shift_min * 0.1f;
      __depth_scaling = (1.0f / oz_median) / __data_normalize_median;
      if (__verbose_level > 2)
        std::cout << "Depth normalized by " << __depth_scaling << " ("
                  << oz_median << ")\n";

      for (int i = 0; i < _num_camera; ++i) {
        // move the camera a little bit?
        if (!__depth_degeneracy_fix) {
        } else if ((cpdist1[i] < dist_threshold ||
                    cpdist2[i] > -dist_threshold)) {
          float shift = shift_min;  //(cpdist1[i] <= -cpdist2[i] ? shift_min :
                                    //-shift_min);
          // if(cpdist1[i] < dist_bound && cpdist2[i] > - dist_bound) shift = -
          // 0.5f * (cpdist1[i] + cpdist2[i]);
          bool boths =
              cpdist1[i] < dist_threshold && cpdist2[i] > -dist_threshold;
          _camera_data[i].t[2] += shift;
          if (__verbose_level > 3)
            std::cout << "Adjust C" << std::setw(5) << i << " by "
                      << std::setw(12) << shift << " [B" << std::setw(2)
                      << cambpj[i] << "/" << std::setw(5) << camnpj[i] << "] ["
                      << (boths ? 'X' : ' ') << "][" << cpdist1[i] << ", "
                      << cpdist2[i] << "]\n";
          __num_camera_modified++;
        }
        _camera_data[i].t[0] *= __depth_scaling;
        _camera_data[i].t[1] *= __depth_scaling;
        _camera_data[i].t[2] *= __depth_scaling;
      }
      for (int i = 0; i < _num_point; ++i) {
        /////////////////////////////////
        _point_data[4 * i + 0] *= __depth_scaling;
        _point_data[4 * i + 1] *= __depth_scaling;
        _point_data[4 * i + 2] *= __depth_scaling;
      }
    }
    if (__num_point_behind > 0)
      std::cout << "WARNING: " << __num_point_behind
                << " points are behind cameras.\n";
    if (__num_camera_modified > 0)
      std::cout << "WARNING: " << __num_camera_modified
                << " camera moved to avoid degeneracy.\n";
  }
}

void SparseBundleCU::NormalizeData() {
  TimerBA timer(this, TIMER_PREPROCESSING);
  NormalizeDataD();
  NormalizeDataF();
}

void SparseBundleCU::DenormalizeData() {
  if (__focal_normalize && __focal_scaling != 1.0f) {
    float squared_focal_factor = (__focal_scaling * __focal_scaling);
    for (int i = 0; i < _num_camera; ++i) {
      _camera_data[i].f /= __focal_scaling;
      if (__use_radial_distortion == -1)
        _camera_data[i].radial *= squared_focal_factor;
      _camera_data[i].distortion_type = __use_radial_distortion;
    }
    _projection_sse /= squared_focal_factor;
    __focal_scaling = 1.0f;
    _imgpt_datax.resize(0);
  } else if (__use_radial_distortion) {
    for (int i = 0; i < _num_camera; ++i)
      _camera_data[i].distortion_type = __use_radial_distortion;
  }

  if (__depth_normalize && __depth_scaling != 1.0f) {
    for (int i = 0; i < _num_camera; ++i) {
      _camera_data[i].t[0] /= __depth_scaling;
      _camera_data[i].t[1] /= __depth_scaling;
      _camera_data[i].t[2] /= __depth_scaling;
    }
    for (int i = 0; i < _num_point; ++i) {
      _point_data[4 * i + 0] /= __depth_scaling;
      _point_data[4 * i + 1] /= __depth_scaling;
      _point_data[4 * i + 2] /= __depth_scaling;
    }
    __depth_scaling = 1.0f;
  }
}

void SparseBundleCU::TransferDataToHost() {
  TimerBA timer(this, TIMER_GPU_DOWNLOAD);
  _cuCameraData.CopyToHost(_camera_data);
  _cuPointData.CopyToHost(_point_data);
}

float SparseBundleCU::EvaluateProjection(CuTexImage& cam, CuTexImage& point,
                                         CuTexImage& proj) {
  ++__num_projection_eval;
  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true);
  ComputeProjection(cam, point, _cuMeasurements, _cuProjectionMap, proj,
                    __use_radial_distortion);
  if (_num_imgpt_q > 0)
    ComputeProjectionQ(cam, _cuCameraQMap, _cuCameraQMapW, proj, _num_imgpt);
  return (float)ComputeVectorNorm(proj, _cuBufferData);
}

float SparseBundleCU::EvaluateProjectionX(CuTexImage& cam, CuTexImage& point,
                                          CuTexImage& proj) {
  ++__num_projection_eval;
  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_PJ, true);
  ComputeProjectionX(cam, point, _cuMeasurements, _cuProjectionMap, proj,
                     __use_radial_distortion);
  if (_num_imgpt_q > 0)
    ComputeProjectionQ(cam, _cuCameraQMap, _cuCameraQMapW, proj, _num_imgpt);
  return (float)ComputeVectorNorm(proj, _cuBufferData);
}

void SparseBundleCU::DebugProjections() {
  double e1 = 0, e2 = 0;
  for (int i = 0; i < _num_imgpt; ++i) {
    float* c = (float*)(_camera_data + _camera_idx[i]);
    float* p = _point_data + 4 * _point_idx[i];
    const float* m = _imgpt_datax.size() > 0 ? (&_imgpt_datax[i * 2])
                                             : (_imgpt_data + 2 * i);
    float* r = c + 4;
    float* t = c + 1;
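    // CameraT layout as used here: c[0] = focal length, c[1..3] = translation,
    // c[4..12] = row-major 3x3 rotation matrix, c[13] = radial distortion.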
    float dx1, dy1;
    ////////////////////////////////////////////////////////////////////////////////
    float z = r[6] * p[0] + r[7] * p[1] + r[8] * p[2] + t[2];
    float xx = (r[0] * p[0] + r[1] * p[1] + r[2] * p[2] + t[0]);
    float yy = (r[3] * p[0] + r[4] * p[1] + r[5] * p[2] + t[1]);
    float x = xx / z;
    float y = yy / z;
    if (__use_radial_distortion == -1) {
      float rn = (m[0] * m[0] + m[1] * m[1]) * c[13] + 1.0f;
      dx1 = c[0] * x - m[0] * rn;
      dy1 = c[0] * y - m[1] * rn;
      e1 += (dx1 * dx1 + dy1 * dy1);
      e2 += (dx1 * dx1 + dy1 * dy1) / (rn * rn);
    } else if (__use_radial_distortion) {
      float rn = (x * x + y * y) * c[13] + 1.0f;
      dx1 = c[0] * x * rn - m[0];
      dy1 = c[0] * y * rn - m[1];
      e1 += (dx1 * dx1 + dy1 * dy1) / (rn * rn);
      e2 += (dx1 * dx1 + dy1 * dy1);
    } else {
      dx1 = c[0] * x - m[0];
      dy1 = c[0] * y - m[1];
      e1 += (dx1 * dx1 + dy1 * dy1);
      e2 += (dx1 * dx1 + dy1 * dy1);
    }
    if (!isfinite(dx1) || !isfinite(dy1))
      std::cout << "x = " << xx << " y = " << yy << " z = " << z << '\n'
                << "t0 = " << t[0] << " t1 = " << t[1] << " t2 = " << t[2]
                << '\n' << "p0 = " << p[0] << " p1 = " << p[1]
                << " p2 = " << p[2] << '\n';
  }
  e1 = e1 / (__focal_scaling * __focal_scaling) / _num_imgpt;
  e2 = e2 / (__focal_scaling * __focal_scaling) / _num_imgpt;
  std::cout << "DEBUG: mean squared error = " << e1
            << " in undistorted domain;\n";
  std::cout << "DEBUG: mean squared error = " << e2
            << " in distorted domain.\n";
}

bool SparseBundleCU::InitializeStorageForCG() {
  bool success = true;
  size_t total_sz = 0;
  int plen = GetParameterLength();  // plen = 8m + 4n

  //////////////////////////////////////////// 6q
  ALLOCATE_REQUIRED_DATA(_cuVectorJtE, plen, 1);
  ALLOCATE_REQUIRED_DATA(_cuVectorXK, plen, 1);
  ALLOCATE_REQUIRED_DATA(_cuVectorPK, plen, 1);
  ALLOCATE_REQUIRED_DATA(_cuVectorRK, plen, 1);
  ALLOCATE_REQUIRED_DATA(_cuVectorJJ, plen, 1);
  ALLOCATE_REQUIRED_DATA(_cuVectorZK, plen, 1);

  /////////////////////////////////
  unsigned int cblock_len = (__use_radial_distortion ? 64 : 56);
  ALLOCATE_REQUIRED_DATA(_cuBlockPC, _num_camera * cblock_len + 12 * _num_point,
                         1);  // (64 or 56)m + 12n
  if (__accurate_gain_ratio) {
    ALLOCATE_REQUIRED_DATA(_cuVectorJX, _num_imgpt + _num_imgpt_q, 2);  // 2k
  } else {
    _cuVectorJX.SetTexture(_cuImageProj.data(), _num_imgpt + _num_imgpt_q, 2);
  }
  ALLOCATE_OPTIONAL_DATA(_cuVectorSJ, plen, 1, __jacobian_normalize);

  /////////////////////////////////////////
  __memory_usage += total_sz;
  if (__verbose_level > 1)
    std::cout << "Memory for Conjugate Gradient Solver:\t"
              << (total_sz / 1024 / 1024) << "MB\n";
  return success;
}

void SparseBundleCU::PrepareJacobianNormalization() {
  if (!_cuVectorSJ.IsValid()) return;
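
  // Build the Jacobi column scaling SJ ~ 1/sqrt(diag(Jt*J)). The Jacobians are
  // evaluated once with SJ temporarily swapped out (via the null image) so that
  // the unscaled diagonal can be extracted first.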

  if ((__jc_store_transpose || __jc_store_original) &&
      _cuJacobianPoint.IsValid() && !__bundle_current_mode) {
    CuTexImage null;
    null.SwapData(_cuVectorSJ);
    EvaluateJacobians();
    null.SwapData(_cuVectorSJ);
    ComputeDiagonal(_cuVectorJJ, _cuVectorSJ);
    ComputeSQRT(_cuVectorSJ);
  } else {
    CuTexImage null;
    null.SwapData(_cuVectorSJ);
    EvaluateJacobians();
    ComputeBlockPC(0, true);
    null.SwapData(_cuVectorSJ);
    _cuVectorJJ.SwapData(_cuVectorSJ);
    ProgramCU::ComputeRSQRT(_cuVectorSJ);
  }
}

void SparseBundleCU::EvaluateJacobians(bool shuffle) {
  if (__no_jacobian_store) return;
  if (__bundle_current_mode == BUNDLE_ONLY_MOTION && !__jc_store_original &&
      !__jc_store_transpose)
    return;
  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JJ, true);
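  // Two storage layouts: when the original (projection-ordered) camera Jacobian
  // is kept, it is computed directly and optionally shuffled into the
  // camera-ordered transpose; otherwise the transpose layout is filled directly
  // using the camera-ordered measurement list.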

  if (__jc_store_original || !__jc_store_transpose) {
    ComputeJacobian(_cuCameraData, _cuPointData, _cuJacobianCamera,
                    _cuJacobianPoint, _cuProjectionMap, _cuVectorSJ,
                    _cuMeasurements, _cuCameraMeasurementList,
                    __fixed_intrinsics, __use_radial_distortion, false);
    if (shuffle && __jc_store_transpose && _cuJacobianCameraT.IsValid())
      ShuffleCameraJacobian(_cuJacobianCamera, _cuCameraMeasurementList,
                            _cuJacobianCameraT);
  } else {
    ComputeJacobian(_cuCameraData, _cuPointData, _cuJacobianCameraT,
                    _cuJacobianPoint, _cuProjectionMap, _cuVectorSJ,
                    _cuMeasurements, _cuCameraMeasurementListT,
                    __fixed_intrinsics, __use_radial_distortion, true);
  }
  ++__num_jacobian_eval;
}

void SparseBundleCU::ComputeJtE(CuTexImage& E, CuTexImage& JtE, int mode) {
  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JTE, true);
  if (mode == 0) mode = __bundle_current_mode;
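  // mode: 0 = full system, 1 = camera (motion) block only, 2 = point
  // (structure) block only; the same convention is used by ComputeJX and
  // ApplyBlockPC.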
  if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) {
    ProgramCU::ComputeJtE_(E, JtE, _cuCameraData, _cuPointData, _cuMeasurements,
                           _cuCameraMeasurementMap, _cuCameraMeasurementList,
                           _cuPointMeasurementMap, _cuProjectionMap,
                           _cuJacobianPoint, __fixed_intrinsics,
                           __use_radial_distortion, mode);

    ////////////////////////////////////////////////////////////////////////////////////
    if (!_cuVectorSJ.IsValid()) {
    } else if (mode == 2) {
      if (!_cuJacobianPoint.IsValid())
        ComputeVXY(JtE, _cuVectorSJ, JtE, _num_point * 4, _num_camera * 8);
    } else if (mode == 1)
      ComputeVXY(JtE, _cuVectorSJ, JtE, _num_camera * 8);
    else
      ComputeVXY(JtE, _cuVectorSJ, JtE,
                 _cuJacobianPoint.IsValid() ? _num_camera * 8 : 0);

  } else if (__jc_store_transpose) {
    ProgramCU::ComputeJtE(E, _cuJacobianCameraT, _cuCameraMeasurementMap,
                          _cuCameraMeasurementList, _cuJacobianPoint,
                          _cuPointMeasurementMap, JtE, true, mode);
  } else {
    ProgramCU::ComputeJtE(E, _cuJacobianCamera, _cuCameraMeasurementMap,
                          _cuCameraMeasurementList, _cuJacobianPoint,
                          _cuPointMeasurementMap, JtE, false, mode);
  }

  if (mode != 2 && _num_imgpt_q > 0)
    ProgramCU::ComputeJQtEC(E, _cuCameraQList, _cuCameraQListW, _cuVectorSJ,
                            JtE);
}

void SparseBundleCU::SaveBundleRecord(int iter, float res, float damping,
                                      float& g_norm, float& g_inf) {
  // Only compute these norms when the corresponding option is enabled;
  // for large datasets the gradient-based check rarely triggers anyway.
  g_inf =
      __lm_check_gradient ? ComputeVectorMax(_cuVectorJtE, _cuBufferData) : 0;
  g_norm = __save_gradient_norm
               ? float(ComputeVectorNorm(_cuVectorJtE, _cuBufferData))
               : g_inf;
  ConfigBA::SaveBundleRecord(iter, res, damping, g_norm, g_inf);
}

void SparseBundleCU::ComputeJX(CuTexImage& X, CuTexImage& JX, int mode) {
  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_JX, true);
  if (__no_jacobian_store || (__multiply_jx_usenoj && mode != 2) ||
      !__jc_store_original) {
    if (_cuVectorSJ.IsValid()) {
      if (mode == 0)
        ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK);
      else if (mode == 1)
        ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK, _num_camera * 8);
      else if (mode == 2)
        ProgramCU::ComputeVXY(X, _cuVectorSJ, _cuVectorZK, _num_point * 4,
                              _num_camera * 8);
      ProgramCU::ComputeJX_(_cuVectorZK, JX, _cuCameraData, _cuPointData,
                            _cuMeasurements, _cuProjectionMap,
                            __fixed_intrinsics, __use_radial_distortion, mode);
    } else {
      ProgramCU::ComputeJX_(X, JX, _cuCameraData, _cuPointData, _cuMeasurements,
                            _cuProjectionMap, __fixed_intrinsics,
                            __use_radial_distortion, mode);
    }
  } else {
    ProgramCU::ComputeJX(_num_camera * 2, X, _cuJacobianCamera,
                         _cuJacobianPoint, _cuProjectionMap, JX, mode);
  }

  if (_num_imgpt_q > 0 && mode != 2) {
    ProgramCU::ComputeJQX(X, _cuCameraQMap, _cuCameraQMapW, _cuVectorSJ, JX,
                          _num_imgpt);
  }
}

void SparseBundleCU::ComputeBlockPC(float lambda, bool dampd) {
  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_BC, true);

  bool use_diagonal_q = _cuCameraQListW.IsValid() && __bundle_current_mode != 2;
  if (use_diagonal_q)
    ComputeDiagonalQ(_cuCameraQListW, _cuVectorSJ, _cuVectorJJ);

  if (__no_jacobian_store || (!__jc_store_original && !__jc_store_transpose)) {
    ComputeDiagonalBlock_(
        lambda, dampd, _cuCameraData, _cuPointData, _cuMeasurements,
        _cuCameraMeasurementMap, _cuCameraMeasurementList,
        _cuPointMeasurementMap, _cuProjectionMap, _cuJacobianPoint, _cuVectorSJ,
        _cuVectorJJ, _cuBlockPC, __fixed_intrinsics, __use_radial_distortion,
        use_diagonal_q, __bundle_current_mode);
  } else if (__jc_store_transpose) {
    ComputeDiagonalBlock(lambda, dampd, _cuJacobianCameraT,
                         _cuCameraMeasurementMap, _cuJacobianPoint,
                         _cuPointMeasurementMap, _cuCameraMeasurementList,
                         _cuVectorJJ, _cuBlockPC, __use_radial_distortion, true,
                         use_diagonal_q, __bundle_current_mode);
  } else {
    ComputeDiagonalBlock(lambda, dampd, _cuJacobianCamera,
                         _cuCameraMeasurementMap, _cuJacobianPoint,
                         _cuPointMeasurementMap, _cuCameraMeasurementList,
                         _cuVectorJJ, _cuBlockPC, __use_radial_distortion,
                         false, use_diagonal_q, __bundle_current_mode);
  }
}

void SparseBundleCU::ApplyBlockPC(CuTexImage& v, CuTexImage& pv, int mode) {
  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_MP, true);
  MultiplyBlockConditioner(_num_camera, _num_point, _cuBlockPC, v, pv,
                           __use_radial_distortion, mode);
}

void SparseBundleCU::ComputeDiagonal(CuTexImage& JJ, CuTexImage& JJI) {
  // Nothing to compute when no Jacobian blocks are stored.
  if (__no_jacobian_store) return;
  if (!__jc_store_transpose && !__jc_store_original) return;

  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_DD, true);
  bool use_diagonal_q = _cuCameraQListW.IsValid();
  if (use_diagonal_q) {
    CuTexImage null;
    ComputeDiagonalQ(_cuCameraQListW, null, JJ);
  }
  if (__jc_store_transpose) {
    ProgramCU::ComputeDiagonal(_cuJacobianCameraT, _cuCameraMeasurementMap,
                               _cuJacobianPoint, _cuPointMeasurementMap,
                               _cuCameraMeasurementList, JJ, JJI, true,
                               __use_radial_distortion, use_diagonal_q);
  } else {
    ProgramCU::ComputeDiagonal(_cuJacobianCamera, _cuCameraMeasurementMap,
                               _cuJacobianPoint, _cuPointMeasurementMap,
                               _cuCameraMeasurementList, JJ, JJI, false,
                               __use_radial_distortion, use_diagonal_q);
  }
}

int SparseBundleCU::SolveNormalEquationPCGX(float lambda) {
  //----------------------------------------------------------
  //(Jt * J + lambda * diag(Jt * J)) X = Jt * e
  //-------------------------------------------------------------
  TimerBA timer(this, TIMER_CG_ITERATION);
  __recent_cg_status = ' ';

  // Set up the block preconditioner and the Jt*J diagonal used for LM damping.
  int plen = GetParameterLength();
  CuTexImage null;
  CuTexImage& VectorDP =
      __lm_use_diagonal_damp ? _cuVectorJJ : null;  // diagonal
  ComputeBlockPC(lambda, __lm_use_diagonal_damp);

  ///////////////////////////////////////////////////////
  // Damped block diagonal B = [BC 0; 0 BP], block preconditioner M = [mc 0; 0 mp]
  // Reduced (Schur complement) camera system solved by CG:
  //   A * x = BC * x - JcT * Jp * mp * JpT * Jc * x
  //         = (JcT * Jc + lambda * D) * x - JcT * Jp * mp * JpT * Jc * x
  ////////////////////////////////////////////////////////////

  CuTexImage r;
  r.SetTexture(_cuVectorRK.data(), 8 * _num_camera);
  CuTexImage p;
  p.SetTexture(_cuVectorPK.data(), 8 * _num_camera);
  CuTexImage z;
  z.SetTexture(_cuVectorZK.data(), 8 * _num_camera);
  CuTexImage x;
  x.SetTexture(_cuVectorXK.data(), 8 * _num_camera);
  CuTexImage d;
  d.SetTexture(VectorDP.data(), 8 * _num_camera);

  CuTexImage& u = _cuVectorRK;
  CuTexImage& v = _cuVectorPK;
  CuTexImage up;
  up.SetTexture(u.data() + 8 * _num_camera, 4 * _num_point);
  CuTexImage vp;
  vp.SetTexture(v.data() + 8 * _num_camera, 4 * _num_point);
  CuTexImage uc;
  uc.SetTexture(z.data(), 8 * _num_camera);

  CuTexImage& e = _cuVectorJX;
  CuTexImage& e2 = _cuImageProj;
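  // r, p, z, x, d alias the camera (8m) blocks of the full-length vectors,
  // while up/vp alias the point (4n) blocks of u/v. Each product with the
  // reduced matrix A = (JcT*Jc + lambda*Dc) - JcT*Jp*Mp*JpT*Jc is assembled
  // from the ComputeJX / ComputeJtE / ApplyBlockPC / ComputeSXYPZ kernel calls
  // below, so A itself is never formed explicitly.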

  ApplyBlockPC(_cuVectorJtE, u, 2);
  ComputeJX(u, e, 2);
  ComputeJtE(e, uc, 1);
  ComputeSAXPY(-1.0f, uc, _cuVectorJtE, r);  // r
  ApplyBlockPC(r, p, 1);  // z = p = M r

  float_t rtz0 = (float_t)ComputeVectorDot(r, p, _cuBufferData);  // r(0)' *
                                                                  // z(0)
  ComputeJX(p, e, 1);  // Jc * x
  ComputeJtE(e, u, 2);  // JpT * jc * x
  ApplyBlockPC(u, v, 2);
  float_t qtq0 = (float_t)ComputeVectorNorm(e, _cuBufferData);  // q(0)' * q(0)
  float_t pdp0 =
      (float_t)ComputeVectorNormW(p, d, _cuBufferData);  // p(0)' * DDD * p(0)
  float_t uv0 = (float_t)ComputeVectorDot(up, vp, _cuBufferData);
  float_t alpha0 = rtz0 / (qtq0 + lambda * pdp0 - uv0);

  if (__verbose_cg_iteration)
    std::cout << " --0,\t alpha = " << alpha0
              << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";
  if (!isfinite(alpha0)) {
    return 0;
  }
  if (alpha0 == 0) {
    __recent_cg_status = 'I';
    return 1;
  }

  ////////////////////////////////////////////////////////////
  ComputeSAX((float)alpha0, p, x);  // x(k+1) = x(k) + a(k) * p(k)
  ComputeJX(v, e2, 2);  // Jp * mp * JpT * Jc * p
  ComputeSAXPY(-1.0f, e2, e, e);
  ComputeJtE(e, uc, 1);  // JcT * ....
  ComputeSXYPZ(lambda, d, p, uc, uc);
  ComputeSAXPY((float)-alpha0, uc, r, r);  // r(k + 1) = r(k) - a(k) * A * pk

  //////////////////////////////////////////////////////////////////////////
  float_t rtzk = rtz0, rtz_min = rtz0, betak;
  int iteration = 1;
  ++__num_cg_iteration;

  while (true) {
    ApplyBlockPC(r, z, 1);

    ///////////////////////////////////////////////////////////////////////////
    float_t rtzp = rtzk;
    rtzk = (float_t)ComputeVectorDot(
        r, z, _cuBufferData);  //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1)
    float_t rtz_ratio = sqrt(fabs(rtzk / rtz0));

    if (rtz_ratio < __cg_norm_threshold) {
      if (__recent_cg_status == ' ')
        __recent_cg_status = iteration < std::min(10, __cg_min_iteration)
                                 ? '0' + iteration
                                 : 'N';
      if (iteration >= __cg_min_iteration) break;
    }
    ////////////////////////////////////////////////////////////////////////////
    betak = rtzk / rtzp;  // beta
    rtz_min = std::min(rtz_min, rtzk);

    ComputeSAXPY((float)betak, p, z, p);  // p(k) = z(k) + b(k) * p(k - 1)
    ComputeJX(p, e, 1);  // Jc * p
    ComputeJtE(e, u, 2);  // JpT * jc * p
    ApplyBlockPC(u, v, 2);
    //////////////////////////////////////////////////////////////////////

    float_t qtqk = (float_t)ComputeVectorNorm(e, _cuBufferData);  // q(k)' q(k)
    float_t pdpk =
        (float_t)ComputeVectorNormW(p, d, _cuBufferData);  // p(k)' * DDD * p(k)
    float_t uvk = (float_t)ComputeVectorDot(up, vp, _cuBufferData);
    float_t alphak = rtzk / (qtqk + lambda * pdpk - uvk);

    /////////////////////////////////////////////////////
    if (__verbose_cg_iteration)
      std::cout << " --" << iteration << ",\t alpha= " << alphak
                << ", rtzk/rtz0 = " << rtz_ratio
                << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";

    ///////////////////////////////////////////////////
    if (!isfinite(alphak) || rtz_ratio > __cg_norm_guard) {
      __recent_cg_status = 'X';
      break;
    }  // something doesn't converge..

    ////////////////////////////////////////////////
    ComputeSAXPY((float)alphak, p, x, x);  // x(k+1) = x(k) + a(k) * p(k)

    /////////////////////////////////////////////////
    ++iteration;
    ++__num_cg_iteration;
    if (iteration >= std::min(__cg_max_iteration, plen)) break;

    ComputeJX(v, e2, 2);  // Jp * mp * JpT * Jc * p
    ComputeSAXPY(-1.0f, e2, e, e);
    ComputeJtE(e, uc, 1);  // JcT * ....
    ComputeSXYPZ(lambda, d, p, uc, uc);
    ComputeSAXPY((float)-alphak, uc, r, r);  // r(k + 1) = r(k) - a(k) * A * pk
  }

  // if(__recent_cg_status == 'X')     return iteration;

  ComputeJX(x, e, 1);
  ComputeJtE(e, u, 2);
  CuTexImage jte_p;
  jte_p.SetTexture(_cuVectorJtE.data() + 8 * _num_camera, _num_point * 4);
  ComputeSAXPY(-1.0f, up, jte_p, vp);
  ApplyBlockPC(v, _cuVectorXK, 2);
  return iteration;
}

int SparseBundleCU::SolveNormalEquationPCGB(float lambda) {
  //----------------------------------------------------------
  //(Jt * J + lambda * diag(Jt * J)) X = Jt * e
  //-------------------------------------------------------------
  TimerBA timer(this, TIMER_CG_ITERATION);
  __recent_cg_status = ' ';

  // Set up the block preconditioner and the Jt*J diagonal used for LM damping.
  int plen = GetParameterLength();
  CuTexImage null;
  CuTexImage& VectorDP =
      __lm_use_diagonal_damp ? _cuVectorJJ : null;  // diagonal
  CuTexImage& VectorQK = _cuVectorZK;  // temporary
  ComputeBlockPC(lambda, __lm_use_diagonal_damp);

  ////////////////////////////////////////////////////////
  ApplyBlockPC(_cuVectorJtE,
               _cuVectorPK);  // z(0) = p(0) = M * r(0), with r(0) = Jt * e
  ComputeJX(_cuVectorPK, _cuVectorJX);  // q(0) = J * p(0)

  //////////////////////////////////////////////////
  float_t rtz0 = (float_t)ComputeVectorDot(_cuVectorJtE, _cuVectorPK,
                                           _cuBufferData);  // r(0)' * z(0)
  float_t qtq0 =
      (float_t)ComputeVectorNorm(_cuVectorJX, _cuBufferData);  // q(0)' * q(0)
  float_t ptdp0 = (float_t)ComputeVectorNormW(
      _cuVectorPK, VectorDP, _cuBufferData);  // p(0)' * DDD * p(0)
  float_t alpha0 = rtz0 / (qtq0 + lambda * ptdp0);

  if (__verbose_cg_iteration)
    std::cout << " --0,\t alpha = " << alpha0
              << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";
  if (!isfinite(alpha0)) {
    return 0;
  }
  if (alpha0 == 0) {
    __recent_cg_status = 'I';
    return 1;
  }

  ////////////////////////////////////////////////////////////
  ComputeSAX((float)alpha0, _cuVectorPK,
             _cuVectorXK);  // x(k+1) = x(k) + a(k) * p(k)
  ComputeJtE(_cuVectorJX, VectorQK);  // Jt * (J * p0)

  ComputeSXYPZ(lambda, VectorDP, _cuVectorPK, VectorQK,
               VectorQK);  // Jt * J * p0 + lambda * DDD * p0
  ComputeSAXPY(
      (float)-alpha0, VectorQK, _cuVectorJtE,
      _cuVectorRK);  // r(k+1) = r(k) - a(k) * (Jt * q(k)  + DDD * p(k)) ;

  float_t rtzk = rtz0, rtz_min = rtz0, betak;
  int iteration = 1;
  ++__num_cg_iteration;

  while (true) {
    ApplyBlockPC(_cuVectorRK, _cuVectorZK);

    ///////////////////////////////////////////////////////////////////////////
    float_t rtzp = rtzk;
    rtzk = (float_t)ComputeVectorDot(
        _cuVectorRK, _cuVectorZK,
        _cuBufferData);  //[r(k + 1) = M^(-1) * z(k + 1)] * z(k+1)
    float_t rtz_ratio = sqrt(fabs(rtzk / rtz0));
    if (rtz_ratio < __cg_norm_threshold) {
      if (__recent_cg_status == ' ')
        __recent_cg_status = iteration < std::min(10, __cg_min_iteration)
                                 ? '0' + iteration
                                 : 'N';
      if (iteration >= __cg_min_iteration) break;
    }

    ////////////////////////////////////////////////////////////////////////////
    betak = rtzk / rtzp;  // beta
    rtz_min = std::min(rtz_min, rtzk);

    ComputeSAXPY((float)betak, _cuVectorPK, _cuVectorZK,
                 _cuVectorPK);  // p(k) = z(k) + b(k) * p(k - 1)
    ComputeJX(_cuVectorPK, _cuVectorJX);  // q(k) = J * p(k)
    //////////////////////////////////////////////////////////////////////

    float_t qtqk =
        (float_t)ComputeVectorNorm(_cuVectorJX, _cuBufferData);  // q(k)' q(k)
    float_t ptdpk = (float_t)ComputeVectorNormW(
        _cuVectorPK, VectorDP, _cuBufferData);  // p(k)' * DDD * p(k)
    float_t alphak = rtzk / (qtqk + lambda * ptdpk);

    /////////////////////////////////////////////////////
    if (__verbose_cg_iteration)
      std::cout << " --" << iteration << ",\t alpha= " << alphak
                << ", rtzk/rtz0 = " << rtz_ratio
                << ", t = " << BundleTimerGetNow(TIMER_CG_ITERATION) << "\n";

    ///////////////////////////////////////////////////
    if (!isfinite(alphak) || rtz_ratio > __cg_norm_guard) {
      __recent_cg_status = 'X';
      break;
    }  // something doesn't converge..

    ////////////////////////////////////////////////
    ComputeSAXPY((float)alphak, _cuVectorPK, _cuVectorXK,
                 _cuVectorXK);  // x(k+1) = x(k) + a(k) * p(k)

    /////////////////////////////////////////////////
    ++iteration;
    ++__num_cg_iteration;
    if (iteration >= std::min(__cg_max_iteration, plen)) break;

    // if(iteration == 2 && rtz_ratio < __cg_norm_threshold)
    if (__cg_recalculate_freq > 0 && iteration % __cg_recalculate_freq == 0) {
      ////r = JtE - (Jt J + lambda * D) x
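      // Recomputing the residual from scratch every __cg_recalculate_freq
      // iterations presumably limits accumulation of round-off error in the
      // incremental update used on the other branch.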
      ComputeJX(_cuVectorXK, _cuVectorJX);
      ComputeJtE(_cuVectorJX, VectorQK);
      ComputeSXYPZ(lambda, VectorDP, _cuVectorXK, VectorQK, VectorQK);
      ComputeSAXPY(-1.0f, VectorQK, _cuVectorJtE, _cuVectorRK);
    } else {
      ComputeJtE(_cuVectorJX, VectorQK);
      ComputeSXYPZ(lambda, VectorDP, _cuVectorPK, VectorQK, VectorQK);  //
      ComputeSAXPY(
          (float)-alphak, VectorQK, _cuVectorRK,
          _cuVectorRK);  // r(k+1) = r(k) - a(k) * (Jt * q(k)  + DDD * p(k)) ;
    }
  }
  return iteration;
}

int SparseBundleCU::SolveNormalEquation(float lambda) {
  if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
    ComputeBlockPC(lambda, __lm_use_diagonal_damp);
    ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 1);
    return 1;
  } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
    ComputeBlockPC(lambda, __lm_use_diagonal_damp);
    ApplyBlockPC(_cuVectorJtE, _cuVectorXK, 2);
    return 1;
  } else {
    ////solve linear system using Conjugate Gradients
    return __cg_schur_complement ? SolveNormalEquationPCGX(lambda)
                                 : SolveNormalEquationPCGB(lambda);
  }
}

void SparseBundleCU::RunTestIterationLM(bool reduced) {
  EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
  EvaluateJacobians();
  ComputeJtE(_cuImageProj, _cuVectorJtE);
  if (reduced)
    SolveNormalEquationPCGX(__lm_initial_damp);
  else
    SolveNormalEquationPCGB(__lm_initial_damp);
  UpdateCameraPoint(_cuVectorZK, _cuImageProj);
  ComputeVectorDot(_cuVectorXK, _cuVectorJtE, _cuBufferData);
  ComputeJX(_cuVectorXK, _cuVectorJX);
  ComputeVectorNorm(_cuVectorJX, _cuBufferData);
}

float SparseBundleCU::UpdateCameraPoint(CuTexImage& dx,
                                        CuTexImage& cuImageTempProj) {
  ConfigBA::TimerBA timer(this, TIMER_FUNCTION_UP, true);
  if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
    if (__jacobian_normalize)
      ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 8 * _num_camera);
    ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx,
                                 _cuCameraDataEX, _cuPointDataEX,
                                 __bundle_current_mode);
    return EvaluateProjection(_cuCameraDataEX, _cuPointData, cuImageTempProj);
  } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
    if (__jacobian_normalize)
      ComputeVXY(_cuVectorXK, _cuVectorSJ, dx, 4 * _num_point, 8 * _num_camera);
    ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx,
                                 _cuCameraDataEX, _cuPointDataEX,
                                 __bundle_current_mode);
    return EvaluateProjection(_cuCameraData, _cuPointDataEX, cuImageTempProj);
  } else {
    if (__jacobian_normalize) ComputeVXY(_cuVectorXK, _cuVectorSJ, dx);
    ProgramCU::UpdateCameraPoint(_num_camera, _cuCameraData, _cuPointData, dx,
                                 _cuCameraDataEX, _cuPointDataEX,
                                 __bundle_current_mode);
    return EvaluateProjection(_cuCameraDataEX, _cuPointDataEX, cuImageTempProj);
  }
}

float SparseBundleCU::SaveUpdatedSystem(float residual_reduction,
                                        float dx_sqnorm, float damping) {
  float expected_reduction;
  if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
    CuTexImage xk;
    xk.SetTexture(_cuVectorXK.data(), 8 * _num_camera);
    CuTexImage jte;
    jte.SetTexture(_cuVectorJtE.data(), 8 * _num_camera);
    float dxtg = (float)ComputeVectorDot(xk, jte, _cuBufferData);
    if (__lm_use_diagonal_damp) {
      CuTexImage jj;
      jj.SetTexture(_cuVectorJJ.data(), 8 * _num_camera);
      float dq = (float)ComputeVectorNormW(xk, jj, _cuBufferData);
      expected_reduction = damping * dq + dxtg;
    } else {
      expected_reduction = damping * dx_sqnorm + dxtg;
    }
    _cuCameraData.SwapData(_cuCameraDataEX);
  } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
    CuTexImage xk;
    xk.SetTexture(_cuVectorXK.data() + 8 * _num_camera, 4 * _num_point);
    CuTexImage jte;
    jte.SetTexture(_cuVectorJtE.data() + 8 * _num_camera, 4 * _num_point);
    float dxtg = (float)ComputeVectorDot(xk, jte, _cuBufferData);
    if (__lm_use_diagonal_damp) {
      CuTexImage jj;
      jj.SetTexture(_cuVectorJJ.data() + 8 * _num_camera, 4 * _num_point);
      float dq = (float)ComputeVectorNormW(xk, jj, _cuBufferData);
      expected_reduction = damping * dq + dxtg;
    } else {
      expected_reduction = damping * dx_sqnorm + dxtg;
    }
    _cuPointData.SwapData(_cuPointDataEX);
  } else {
    float dxtg =
        (float)ComputeVectorDot(_cuVectorXK, _cuVectorJtE, _cuBufferData);
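    // LM gain ratio: rho = actual reduction / predicted reduction, with g = J'e.
    // The "accurate" branch uses the exact decrease of the linearized residual,
    // ||e||^2 - ||e - J*dx||^2 = 2*dx'g - ||J*dx||^2; the damped-model branches
    // use dx'(lambda*D*dx + g) (or lambda*||dx||^2 + dx'g), which matches the
    // textbook predicted decrease up to a factor of 2 that also appears in
    // residual_reduction and cancels in the ratio.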

    if (__accurate_gain_ratio) {
      ComputeJX(_cuVectorXK, _cuVectorJX);
      float njx = (float)ComputeVectorNorm(_cuVectorJX, _cuBufferData);
      expected_reduction = 2.0f * dxtg - njx;
      // the predicted reduction can occasionally come out non-positive
      // (round-off or an overly long step); guard with a small positive fallback
      if (expected_reduction <= 0)
        expected_reduction = 0.001f * residual_reduction;
    } else if (__lm_use_diagonal_damp) {
      float dq =
          (float)ComputeVectorNormW(_cuVectorXK, _cuVectorJJ, _cuBufferData);
      expected_reduction = damping * dq + dxtg;
    } else {
      expected_reduction = damping * dx_sqnorm + dxtg;
    }

    /// save the new motion/structure
    _cuCameraData.SwapData(_cuCameraDataEX);
    _cuPointData.SwapData(_cuPointDataEX);

    //_cuCameraData.CopyToHost(_camera_data);
    //_cuPointData.CopyToHost(_point_data);
    // DebugProjections();
  }
  ////////////////////////////////////////////
  return float(residual_reduction / expected_reduction);
}

void SparseBundleCU::AdjustBundleAdjsutmentMode() {
  if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
    _cuJacobianCamera.InitTexture(0, 0);
    _cuJacobianCameraT.InitTexture(0, 0);
  }
}

float SparseBundleCU::EvaluateDeltaNorm() {
  if (__bundle_current_mode == BUNDLE_ONLY_MOTION) {
    CuTexImage temp;
    temp.SetTexture(_cuVectorXK.data(), 8 * _num_camera);
    return ComputeVectorNorm(temp, _cuBufferData);

  } else if (__bundle_current_mode == BUNDLE_ONLY_STRUCTURE) {
    CuTexImage temp;
    temp.SetTexture(_cuVectorXK.data() + 8 * _num_camera, 4 * _num_point);
    return ComputeVectorNorm(temp, _cuBufferData);
  } else {
    return (float)ComputeVectorNorm(_cuVectorXK, _cuBufferData);
  }
}

void SparseBundleCU::NonlinearOptimizeLM() {
  ////////////////////////////////////////
  TimerBA timer(this, TIMER_OPTIMIZATION);

  ////////////////////////////////////////////////
  float mse_convert_ratio =
      1.0f / (_num_imgpt * __focal_scaling * __focal_scaling);
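  // _projection_sse is accumulated on measurements that were pre-scaled by
  // __focal_scaling, so dividing by (_num_imgpt * __focal_scaling^2) presumably
  // converts it to a mean squared reprojection error in the original units;
  // with __verbose_sse the printed value is multiplied back into a sum.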
  float error_display_ratio = __verbose_sse ? _num_imgpt : 1.0f;
  const int edwidth = __verbose_sse ? 12 : 8;
  _projection_sse =
      EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
  __initial_mse = __final_mse = _projection_sse * mse_convert_ratio;

  // compute jacobian diagonals for normalization
  if (__jacobian_normalize) PrepareJacobianNormalization();

  // evaluate Jacobians
  EvaluateJacobians();
  ComputeJtE(_cuImageProj, _cuVectorJtE);
  ///////////////////////////////////////////////////////////////
  if (__verbose_level)
    std::cout << "Initial " << (__verbose_sse ? "sumed" : "mean")
              << " squared error = " << __initial_mse * error_display_ratio
              << "\n----------------------------------------------\n";

  //////////////////////////////////////////////////
  CuTexImage& cuImageTempProj = _cuVectorJX;
  // CuTexImage& cuVectorTempJX  =   _cuVectorJX;
  CuTexImage& cuVectorDX = _cuVectorSJ.IsValid() ? _cuVectorZK : _cuVectorXK;

  //////////////////////////////////////////////////
  float damping_adjust = 2.0f, damping = __lm_initial_damp, g_norm, g_inf;
  SaveBundleRecord(0, _projection_sse * mse_convert_ratio, damping, g_norm,
                   g_inf);

  ////////////////////////////////////
  std::cout << std::left;
  for (int i = 0; i < __lm_max_iteration && !__abort_flag;
       __current_iteration = (++i)) {
    ////solve linear system
    int num_cg_iteration = SolveNormalEquation(damping);

    // there must be NaN somewhere
    if (num_cg_iteration == 0) {
      if (__verbose_level)
        std::cout << "#" << std::setw(3) << i << " quit on numeric errors\n";
      __pba_return_code = 'E';
      break;
    }

    // there must be infinity somewhere
    if (__recent_cg_status == 'I') {
      std::cout << "#" << std::setw(3) << i << " 0  I e=" << std::setw(edwidth)
                << "------- "
                << " u=" << std::setprecision(3) << std::setw(9) << damping
                << '\n' << std::setprecision(6);
      /////////////increase damping factor
      damping = damping * damping_adjust;
      damping_adjust = 2.0f * damping_adjust;
      --i;
      continue;
    }

    /////////////////////
    ++__num_lm_iteration;

    ////////////////////////////////////
    float dx_sqnorm = EvaluateDeltaNorm(), dx_norm = sqrt(dx_sqnorm);

    // In this library, we check the absolute difference instead of the relative
    // difference
    if (dx_norm <= __lm_delta_threshold) {
      // damping factor must be way too big...or it converges
      if (__verbose_level > 1)
        std::cout << "#" << std::setw(3) << i << " " << std::setw(3)
                  << num_cg_iteration << char(__recent_cg_status)
                  << " quit on too small change (" << dx_norm << "  < "
                  << __lm_delta_threshold << ")\n";
      __pba_return_code = 'S';
      break;
    }
    ///////////////////////////////////////////////////////////////////////
    // update structure and motion, check reprojection error
    float new_residual = UpdateCameraPoint(cuVectorDX, cuImageTempProj);
    float average_residual = new_residual * mse_convert_ratio;
    float residual_reduction = _projection_sse - new_residual;

    // do we find a better solution?
    if (isfinite(new_residual) && residual_reduction > 0) {
      ////compute relative norm change
      float relative_reduction = 1.0f - (new_residual / _projection_sse);

      ////////////////////////////////////
      __num_lm_success++;  // increase counter
      _projection_sse = new_residual;  // save the new residual
      _cuImageProj.SwapData(cuImageTempProj);  // save the new projection

      ///////////////gain ratio////////////////////
      float gain_ratio =
          SaveUpdatedSystem(residual_reduction, dx_sqnorm, damping);

      /////////////////////////////////////
      SaveBundleRecord(i + 1, _projection_sse * mse_convert_ratio, damping,
                       g_norm, g_inf);

      /////////////////////////////////////////////
      if (__verbose_level > 1)
        std::cout << "#" << std::setw(3) << i << " " << std::setw(3)
                  << num_cg_iteration << char(__recent_cg_status)
                  << " e=" << std::setw(edwidth)
                  << average_residual * error_display_ratio
                  << " u=" << std::setprecision(3) << std::setw(9) << damping
                  << " r=" << std::setw(6)
                  << floor(gain_ratio * 1000.f) * 0.001f
                  << " g=" << std::setw(g_norm > 0 ? 9 : 1) << g_norm << " "
                  << std::setw(9) << relative_reduction << ' ' << std::setw(9)
                  << dx_norm << " t=" << int(BundleTimerGetNow()) << "\n"
                  << std::setprecision(6);

      /////////////////////////////
      if (!IsTimeBudgetAvailable()) {
        if (__verbose_level > 1)
          std::cout << "#" << std::setw(3) << i << " used up time budget.\n";
        __pba_return_code = 'T';
        break;
      } else if (__lm_check_gradient && g_inf < __lm_gradient_threshold) {
        if (__verbose_level > 1)
          std::cout << "#" << std::setw(3) << i
                    << " converged with small gradient\n";
        __pba_return_code = 'G';
        break;
      } else if (average_residual * error_display_ratio <= __lm_mse_threshold) {
        if (__verbose_level > 1)
          std::cout << "#" << std::setw(3) << i << " satisfies MSE threshold\n";
        __pba_return_code = 'M';
        break;
      } else {
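        // Smooth damping update for an accepted step, commonly attributed to
        // Nielsen (see Madsen, Nielsen & Tingleff, "Methods for Non-Linear
        // Least Squares Problems"): damping *= max(1/3, 1 - (2*rho - 1)^3),
        // the growth factor damping_adjust is reset to 2, and the result is
        // clamped to the configured damping bounds.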
        /////////////////////////////adjust damping factor
        float temp = gain_ratio * 2.0f - 1.0f;
        float adaptive_adjust = 1.0f - temp * temp * temp;  // 1 - powf(2 * gain_ratio - 1, 3.0f)
        float auto_adjust = std::max(1.0f / 3.0f, adaptive_adjust);

        //////////////////////////////////////////////////
        damping = damping * auto_adjust;
        damping_adjust = 2.0f;
        if (damping < __lm_minimum_damp)
          damping = __lm_minimum_damp;
        else if (__lm_damping_auto_switch == 0 && damping > __lm_maximum_damp &&
                 __lm_use_diagonal_damp)
          damping = __lm_maximum_damp;

        EvaluateJacobians();
        ComputeJtE(_cuImageProj, _cuVectorJtE);
      }
    } else {
      if (__verbose_level > 1)
        std::cout << "#" << std::setw(3) << i << " " << std::setw(3)
                  << num_cg_iteration << char(__recent_cg_status)
                  << " e=" << std::setw(edwidth) << std::left
                  << average_residual * error_display_ratio
                  << " u=" << std::setprecision(3) << std::setw(9) << damping
                  << " r=----- " << (__lm_check_gradient || __save_gradient_norm
                                         ? " g=---------"
                                         : " g=0")
                  << " --------- " << std::setw(9) << dx_norm
                  << " t=" << int(BundleTimerGetNow()) << "\n"
                  << std::setprecision(6);
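      // Rejected step: either switch to identity damping (lambda * I instead
      // of lambda * D) once the damping exceeds __lm_damping_auto_switch, or
      // grow the damping geometrically (damping *= damping_adjust,
      // damping_adjust *= 2), the companion rule to the accepted-step update.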

      if (__lm_damping_auto_switch > 0 && __lm_use_diagonal_damp &&
          damping > __lm_damping_auto_switch) {
        __lm_use_diagonal_damp = false;
        damping = __lm_damping_auto_switch;
        damping_adjust = 2.0f;
        if (__verbose_level > 1)
          std::cout << "NOTE: switch to damping with an identity matix\n";
      } else {
        /////////////increase damping factor
        damping = damping * damping_adjust;
        damping_adjust = 2.0f * damping_adjust;
      }
    }

    if (__verbose_level == 1) std::cout << '.';
  }

  __final_mse = float(_projection_sse * mse_convert_ratio);
  __final_mse_x =
      __use_radial_distortion
          ? EvaluateProjectionX(_cuCameraData, _cuPointData, _cuImageProj) *
                mse_convert_ratio
          : __final_mse;
}

#define PROFILE_(A, B)                    \
  BundleTimerStart(TIMER_PROFILE_STEP);   \
  for (int i = 0; i < repeat; ++i) {      \
    B;                                    \
    FinishWorkCUDA();                     \
  }                                       \
  BundleTimerSwitch(TIMER_PROFILE_STEP);  \
  std::cout << std::setw(24) << A << ": " \
            << (BundleTimerGet(TIMER_PROFILE_STEP) / repeat) << "\n";

#define PROFILE(A, B) PROFILE_(#A, A B)
#define PROXILE(A, B) PROFILE_(A, B)
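// PROFILE stringizes the call A and invokes A B directly, while PROXILE takes
// an explicit label for compound statements; both time `repeat` synchronized
// runs and report the average time per call.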

void SparseBundleCU::RunProfileSteps() {
  const int repeat = __profile_pba;
  std::cout << "---------------------------------\n"
               "|    Run profiling steps ("
            << repeat << ")  |\n"
                         "---------------------------------\n"
            << std::left;

  ///////////////////////////////////////////////
  PROXILE("Upload Measurements",
          _cuMeasurements.CopyFromHost(
              _imgpt_datax.size() > 0 ? &_imgpt_datax[0] : _imgpt_data));
  PROXILE("Upload Point Data", _cuPointData.CopyToHost(_point_data));
  std::cout << "---------------------------------\n";

  /////////////////////////////////////////////
  EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
  PrepareJacobianNormalization();
  EvaluateJacobians();
  ComputeJtE(_cuImageProj, _cuVectorJtE);
  ComputeBlockPC(__lm_initial_damp, true);
  FinishWorkCUDA();
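  // Double the damping until both PCG variants report a 10-iteration solve,
  // presumably to pick a damping level at which the fixed 10-iteration timing
  // runs below behave consistently for both solvers.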

  do {
    if (SolveNormalEquationPCGX(__lm_initial_damp) == 10 &&
        SolveNormalEquationPCGB(__lm_initial_damp) == 10)
      break;
    __lm_initial_damp *= 2.0f;
  } while (__lm_initial_damp < 1024.0f);
  std::cout << "damping set to " << __lm_initial_damp << " for profiling\n"
            << "---------------------------------\n";

  {
    int repeat = 10, cgmin = __cg_min_iteration, cgmax = __cg_max_iteration;
    __cg_max_iteration = __cg_min_iteration = 10;
    __num_cg_iteration = 0;
    PROFILE(SolveNormalEquationPCGX, (__lm_initial_damp));
    if (__num_cg_iteration != 100)
      std::cout << __num_cg_iteration << " cg iterations in all\n";

    /////////////////////////////////////////////////////////////////////
    __num_cg_iteration = 0;
    PROFILE(SolveNormalEquationPCGB, (__lm_initial_damp));
    if (__num_cg_iteration != 100)
      std::cout << __num_cg_iteration << " cg iterations in all\n";
    std::cout << "---------------------------------\n";
    //////////////////////////////////////////////////////
    __num_cg_iteration = 0;
    PROXILE("Single iteration LMX", RunTestIterationLM(true));
    if (__num_cg_iteration != 100)
      std::cout << __num_cg_iteration << " cg iterations in all\n";
    ////////////////////////////////////////////////////////
    __num_cg_iteration = 0;
    PROXILE("Single iteration LMB", RunTestIterationLM(false));
    if (__num_cg_iteration != 100)
      std::cout << __num_cg_iteration << " cg iterations in all\n";
    std::cout << "---------------------------------\n";
    __cg_max_iteration = cgmax;
    __cg_min_iteration = cgmin;
  }
  /////////////////////////////////////////////////////
  PROFILE(UpdateCameraPoint, (_cuVectorZK, _cuImageProj));
  PROFILE(ComputeVectorNorm, (_cuVectorXK, _cuBufferData));
  PROFILE(ComputeVectorDot, (_cuVectorXK, _cuVectorRK, _cuBufferData));
  PROFILE(ComputeVectorNormW, (_cuVectorXK, _cuVectorRK, _cuBufferData));
  PROFILE(ComputeSAXPY, (0.01f, _cuVectorXK, _cuVectorRK, _cuVectorZK));
  PROFILE(ComputeSXYPZ,
          (0.01f, _cuVectorXK, _cuVectorPK, _cuVectorRK, _cuVectorZK));
  std::cout << "---------------------------------\n";
  PROFILE(ComputeVectorNorm, (_cuImageProj, _cuBufferData));
  PROFILE(ComputeSAXPY, (0.000f, _cuImageProj, _cuVectorJX, _cuVectorJX));
  std::cout << "---------------------------------\n";

  __multiply_jx_usenoj = false;
  ///////////////////////////////////////////////////////
  PROFILE(EvaluateProjection, (_cuCameraData, _cuPointData, _cuImageProj));
  PROFILE(ApplyBlockPC, (_cuVectorJtE, _cuVectorPK));
  /////////////////////////////////////////////////
  if (!__no_jacobian_store) {
    if (__jc_store_original) {
      PROFILE(ComputeJX, (_cuVectorJtE, _cuVectorJX));
      PROFILE(EvaluateJacobians, (false));

      if (__jc_store_transpose) {
        PROFILE(
            ShuffleCameraJacobian,
            (_cuJacobianCamera, _cuCameraMeasurementList, _cuJacobianCameraT));
        PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK));
        PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
        PROFILE(ComputeBlockPC, (0.001f, true));

        std::cout << "---------------------------------\n"
                     "|   Not storing original  JC    | \n"
                     "---------------------------------\n";
        __jc_store_original = false;
        PROFILE(EvaluateJacobians, ());
        __jc_store_original = true;
      }
      //////////////////////////////////////////////////

      std::cout << "---------------------------------\n"
                   "|   Not storing transpose JC    | \n"
                   "---------------------------------\n";
      __jc_store_transpose = false;
      PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK));
      PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
      PROFILE(ComputeBlockPC, (0.001f, true));

      //////////////////////////////////////

    } else if (__jc_store_transpose) {
      PROFILE(ComputeDiagonal, (_cuVectorJJ, _cuVectorPK));
      PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
      PROFILE(ComputeBlockPC, (0.001f, true));
      std::cout << "---------------------------------\n"
                   "|   Not storing original  JC    | \n"
                   "---------------------------------\n";
      PROFILE(EvaluateJacobians, ());
    }
  }

  if (!__no_jacobian_store) {
    std::cout << "---------------------------------\n"
                 "| Not storing Camera Jacobians  | \n"
                 "---------------------------------\n";
    __jc_store_transpose = false;
    __jc_store_original = false;
    _cuJacobianCamera.ReleaseData();
    _cuJacobianCameraT.ReleaseData();
    PROFILE(EvaluateJacobians, ());
    PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
    PROFILE(ComputeBlockPC, (0.001f, true));
  }

  ///////////////////////////////////////////////

  std::cout << "---------------------------------\n"
               "|   Not storing any jacobians   |\n"
               "---------------------------------\n";
  __no_jacobian_store = true;
  _cuJacobianPoint.ReleaseData();
  PROFILE(ComputeJX, (_cuVectorJtE, _cuVectorJX));
  PROFILE(ComputeJtE, (_cuImageProj, _cuVectorJtE));
  PROFILE(ComputeBlockPC, (0.001f, true));

  std::cout << "---------------------------------\n";
}

void SparseBundleCU::RunDebugSteps() {
  EvaluateProjection(_cuCameraData, _cuPointData, _cuImageProj);
  EvaluateJacobians();
  ComputeJtE(_cuImageProj, _cuVectorJtE);
  // DEBUG_FUNCN(_cuVectorXK, SolveNormalEquationPCGB, (0.001f), 100);
  DEBUG_FUNCN(_cuVectorJtE, ComputeJtE, (_cuImageProj, _cuVectorJtE), 100);
  DEBUG_FUNCN(_cuVectorJX, ComputeJX, (_cuVectorJtE, _cuVectorJX), 100);
}

void SparseBundleCU::SaveNormalEquation(float lambda) {
  ofstream out1("../../matlab/cg_j.txt");
  ofstream out2("../../matlab/cg_b.txt");
  ofstream out3("../../matlab/cg_x.txt");

  out1 << std::setprecision(20);
  out2 << std::setprecision(20);
  out3 << std::setprecision(20);

  int plen = GetParameterLength();
  vector<float> jc(16 * _num_imgpt);
  vector<float> jp(8 * _num_imgpt);
  vector<float> ee(2 * _num_imgpt);
  vector<float> dx(plen);

  _cuJacobianCamera.CopyToHost(&jc[0]);
  _cuJacobianPoint.CopyToHost(&jp[0]);
  _cuImageProj.CopyToHost(&ee[0]);
  _cuVectorXK.CopyToHost(&dx[0]);
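  // cg_j.txt holds the sparse Jacobian as 1-based (row, column, value) triplets
  // (loadable in MATLAB, e.g. with spconvert), cg_b.txt the 2-D reprojection
  // residuals, and cg_x.txt the current solution vector.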

  for (int i = 0; i < _num_imgpt; ++i) {
    out2 << ee[i * 2] << ' ' << ee[i * 2 + 1] << ' ';
    int cidx = _camera_idx[i], pidx = _point_idx[i];
    float *cp = &jc[i * 16], *pp = &jp[i * 8];
    int cmin = cidx * 8, pmin = 8 * _num_camera + pidx * 4;
    for (int j = 0; j < 8; ++j)
      out1 << (i * 2 + 1) << ' ' << (cmin + j + 1) << ' ' << cp[j] << '\n';
    for (int j = 0; j < 8; ++j)
      out1 << (i * 2 + 2) << ' ' << (cmin + j + 1) << ' ' << cp[j + 8] << '\n';
    for (int j = 0; j < 4; ++j)
      out1 << (i * 2 + 1) << ' ' << (pmin + j + 1) << ' ' << pp[j] << '\n';
    for (int j = 0; j < 4; ++j)
      out1 << (i * 2 + 2) << ' ' << (pmin + j + 1) << ' ' << pp[j + 4] << '\n';
  }

  for (size_t i = 0; i < dx.size(); ++i) out3 << dx[i] << ' ';

  std::cout << "lambda = " << std::setprecision(20) << lambda << '\n';
}

}  // namespace pba