EstervQrCode 1.1.1
Library for qr code manipulation
hierarchical_clustering_index.h
1 /***********************************************************************
2  * Software License Agreement (BSD License)
3  *
4  * Copyright 2008-2011 Marius Muja (mariusm@cs.ubc.ca). All rights reserved.
5  * Copyright 2008-2011 David G. Lowe (lowe@cs.ubc.ca). All rights reserved.
6  *
7  * THE BSD LICENSE
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  *
13  * 1. Redistributions of source code must retain the above copyright
14  * notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  * notice, this list of conditions and the following disclaimer in the
17  * documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
20  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
21  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
22  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
23  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
24  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
28  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29  *************************************************************************/
30 
31 #ifndef OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
32 #define OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_
33 
35 
36 #include <algorithm>
37 #include <map>
38 #include <limits>
39 #include <cmath>
40 
41 #include "general.h"
42 #include "nn_index.h"
43 #include "dist.h"
44 #include "matrix.h"
45 #include "result_set.h"
46 #include "heap.h"
47 #include "allocator.h"
48 #include "random.h"
49 #include "saving.h"
50 
51 
52 namespace cvflann
53 {
54 
55 struct HierarchicalClusteringIndexParams : public IndexParams
56 {
57  HierarchicalClusteringIndexParams(int branching = 32,
58  flann_centers_init_t centers_init = FLANN_CENTERS_RANDOM,
59  int trees = 4, int leaf_size = 100)
60  {
61  (*this)["algorithm"] = FLANN_INDEX_HIERARCHICAL;
62  // The branching factor used in the hierarchical clustering
63  (*this)["branching"] = branching;
64  // Algorithm used for picking the initial cluster centers
65  (*this)["centers_init"] = centers_init;
66  // number of parallel trees to build
67  (*this)["trees"] = trees;
68  // maximum leaf size
69  (*this)["leaf_size"] = leaf_size;
70  }
71 };
72 
73 
80 template <typename Distance>
81 class HierarchicalClusteringIndex : public NNIndex<Distance>
82 {
83 public:
84  typedef typename Distance::ElementType ElementType;
85  typedef typename Distance::ResultType DistanceType;
86 
87 private:
88 
89 
90  typedef void (HierarchicalClusteringIndex::* centersAlgFunction)(int, int*, int, int*, int&);
91 
95  centersAlgFunction chooseCenters;
96 
97 
98 
109  void chooseCentersRandom(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
110  {
111  UniqueRandom r(indices_length);
112 
113  int index;
114  for (index=0; index<k; ++index) {
115  bool duplicate = true;
116  int rnd;
117  while (duplicate) {
118  duplicate = false;
119  rnd = r.next();
120  if (rnd<0) {
121  centers_length = index;
122  return;
123  }
124 
125  centers[index] = dsindices[rnd];
126 
127  for (int j=0; j<index; ++j) {
128  DistanceType sq = distance(dataset[centers[index]], dataset[centers[j]], dataset.cols);
129  if (sq<1e-16) {
130  duplicate = true;
131  }
132  }
133  }
134  }
135 
136  centers_length = index;
137  }
138 
139 
150  void chooseCentersGonzales(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
151  {
152  int n = indices_length;
153 
154  int rnd = rand_int(n);
155  CV_DbgAssert(rnd >=0 && rnd < n);
156 
157  centers[0] = dsindices[rnd];
158 
159  int index;
160  for (index=1; index<k; ++index) {
161 
162  int best_index = -1;
163  DistanceType best_val = 0;
164  for (int j=0; j<n; ++j) {
165  DistanceType dist = distance(dataset[centers[0]],dataset[dsindices[j]],dataset.cols);
166  for (int i=1; i<index; ++i) {
167  DistanceType tmp_dist = distance(dataset[centers[i]],dataset[dsindices[j]],dataset.cols);
168  if (tmp_dist<dist) {
169  dist = tmp_dist;
170  }
171  }
172  if (dist>best_val) {
173  best_val = dist;
174  best_index = j;
175  }
176  }
177  if (best_index!=-1) {
178  centers[index] = dsindices[best_index];
179  }
180  else {
181  break;
182  }
183  }
184  centers_length = index;
185  }
186 
187 
201  void chooseCentersKMeanspp(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
202  {
203  int n = indices_length;
204 
205  double currentPot = 0;
206  DistanceType* closestDistSq = new DistanceType[n];
207 
208  // Choose one random center and set the closestDistSq values
209  int index = rand_int(n);
210  CV_DbgAssert(index >=0 && index < n);
211  centers[0] = dsindices[index];
212 
213  // Computing distance^2 will have the advantage of even higher probability further to pick new centers
214  // far from previous centers (and this complies to "k-means++: the advantages of careful seeding" article)
215  for (int i = 0; i < n; i++) {
216  closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
217  closestDistSq[i] = ensureSquareDistance<Distance>( closestDistSq[i] );
218  currentPot += closestDistSq[i];
219  }
220 
221 
222  const int numLocalTries = 1;
223 
224  // Choose each center
225  int centerCount;
226  for (centerCount = 1; centerCount < k; centerCount++) {
227 
228  // Repeat several trials
229  double bestNewPot = -1;
230  int bestNewIndex = 0;
231  for (int localTrial = 0; localTrial < numLocalTries; localTrial++) {
232 
233  // Choose our center - have to be slightly careful to return a valid answer even accounting
234  // for possible rounding errors
235  double randVal = rand_double(currentPot);
236  for (index = 0; index < n-1; index++) {
237  if (randVal <= closestDistSq[index]) break;
238  else randVal -= closestDistSq[index];
239  }
240 
241  // Compute the new potential
242  double newPot = 0;
243  for (int i = 0; i < n; i++) {
244  DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
245  newPot += std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
246  }
247 
248  // Store the best result
249  if ((bestNewPot < 0)||(newPot < bestNewPot)) {
250  bestNewPot = newPot;
251  bestNewIndex = index;
252  }
253  }
254 
255  // Add the appropriate center
256  centers[centerCount] = dsindices[bestNewIndex];
257  currentPot = bestNewPot;
258  for (int i = 0; i < n; i++) {
259  DistanceType dist = distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols);
260  closestDistSq[i] = std::min( ensureSquareDistance<Distance>(dist), closestDistSq[i] );
261  }
262  }
263 
264  centers_length = centerCount;
265 
266  delete[] closestDistSq;
267  }
268 
269 
287  void GroupWiseCenterChooser(int k, int* dsindices, int indices_length, int* centers, int& centers_length)
288  {
289  const float kSpeedUpFactor = 1.3f;
290 
291  int n = indices_length;
292 
293  DistanceType* closestDistSq = new DistanceType[n];
294 
295  // Choose one random center and set the closestDistSq values
296  int index = rand_int(n);
297  CV_DbgAssert(index >=0 && index < n);
298  centers[0] = dsindices[index];
299 
300  for (int i = 0; i < n; i++) {
301  closestDistSq[i] = distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols);
302  }
303 
304 
305  // Choose each center
306  int centerCount;
307  for (centerCount = 1; centerCount < k; centerCount++) {
308 
309  // Repeat several trials
310  double bestNewPot = -1;
311  int bestNewIndex = 0;
312  DistanceType furthest = 0;
313  for (index = 0; index < n; index++) {
314 
315  // We will test only the potential of the points further than current candidate
316  if( closestDistSq[index] > kSpeedUpFactor * (float)furthest ) {
317 
318  // Compute the new potential
319  double newPot = 0;
320  for (int i = 0; i < n; i++) {
321  newPot += std::min( distance(dataset[dsindices[i]], dataset[dsindices[index]], dataset.cols)
322  , closestDistSq[i] );
323  }
324 
325  // Store the best result
326  if ((bestNewPot < 0)||(newPot <= bestNewPot)) {
327  bestNewPot = newPot;
328  bestNewIndex = index;
329  furthest = closestDistSq[index];
330  }
331  }
332  }
333 
334  // Add the appropriate center
335  centers[centerCount] = dsindices[bestNewIndex];
336  for (int i = 0; i < n; i++) {
337  closestDistSq[i] = std::min( distance(dataset[dsindices[i]], dataset[dsindices[bestNewIndex]], dataset.cols)
338  , closestDistSq[i] );
339  }
340  }
341 
342  centers_length = centerCount;
343 
344  delete[] closestDistSq;
345  }
346 
347 
348 public:
349 
350 
358  HierarchicalClusteringIndex(const Matrix<ElementType>& inputData, const IndexParams& index_params = HierarchicalClusteringIndexParams(),
359  Distance d = Distance())
360  : dataset(inputData), params(index_params), root(NULL), indices(NULL), distance(d)
361  {
362  memoryCounter = 0;
363 
364  size_ = dataset.rows;
365  veclen_ = dataset.cols;
366 
367  branching_ = get_param(params,"branching",32);
368  centers_init_ = get_param(params,"centers_init", FLANN_CENTERS_RANDOM);
369  trees_ = get_param(params,"trees",4);
370  leaf_size_ = get_param(params,"leaf_size",100);
371 
372  if (centers_init_==FLANN_CENTERS_RANDOM) {
373  chooseCenters = &HierarchicalClusteringIndex::chooseCentersRandom;
374  }
375  else if (centers_init_==FLANN_CENTERS_GONZALES) {
376  chooseCenters = &HierarchicalClusteringIndex::chooseCentersGonzales;
377  }
378  else if (centers_init_==FLANN_CENTERS_KMEANSPP) {
379  chooseCenters = &HierarchicalClusteringIndex::chooseCentersKMeanspp;
380  }
381  else if (centers_init_==FLANN_CENTERS_GROUPWISE) {
382  chooseCenters = &HierarchicalClusteringIndex::GroupWiseCenterChooser;
383  }
384  else {
385  FLANN_THROW(cv::Error::StsError, "Unknown algorithm for choosing initial centers.");
386  }
387 
388  root = new NodePtr[trees_];
389  indices = new int*[trees_];
390 
391  for (int i=0; i<trees_; ++i) {
392  root[i] = NULL;
393  indices[i] = NULL;
394  }
395  }
396 
397  HierarchicalClusteringIndex(const HierarchicalClusteringIndex&);
398  HierarchicalClusteringIndex& operator=(const HierarchicalClusteringIndex&);
399 
405  virtual ~HierarchicalClusteringIndex()
406  {
407  if (root!=NULL) {
408  delete[] root;
409  }
410 
411  if (indices!=NULL) {
412  free_indices();
413  delete[] indices;
414  }
415  }
416 
420  size_t size() const CV_OVERRIDE
421  {
422  return size_;
423  }
424 
428  size_t veclen() const CV_OVERRIDE
429  {
430  return veclen_;
431  }
432 
433 
438  int usedMemory() const CV_OVERRIDE
439  {
440  return pool.usedMemory+pool.wastedMemory+memoryCounter;
441  }
442 
446  void buildIndex() CV_OVERRIDE
447  {
448  if (branching_<2) {
449  FLANN_THROW(cv::Error::StsError, "Branching factor must be at least 2");
450  }
451 
452  free_indices();
453 
454  for (int i=0; i<trees_; ++i) {
455  indices[i] = new int[size_];
456  for (size_t j=0; j<size_; ++j) {
457  indices[i][j] = (int)j;
458  }
459  root[i] = pool.allocate<Node>();
460  computeClustering(root[i], indices[i], (int)size_, branching_,0);
461  }
462  }
463 
464 
465  flann_algorithm_t getType() const CV_OVERRIDE
466  {
467  return FLANN_INDEX_HIERARCHICAL;
468  }
469 
470 
471  void saveIndex(FILE* stream) CV_OVERRIDE
472  {
473  save_value(stream, branching_);
474  save_value(stream, trees_);
475  save_value(stream, centers_init_);
476  save_value(stream, leaf_size_);
477  save_value(stream, memoryCounter);
478  for (int i=0; i<trees_; ++i) {
479  save_value(stream, *indices[i], size_);
480  save_tree(stream, root[i], i);
481  }
482 
483  }
484 
485 
486  void loadIndex(FILE* stream) CV_OVERRIDE
487  {
488  if (root!=NULL) {
489  delete[] root;
490  }
491 
492  if (indices!=NULL) {
493  free_indices();
494  delete[] indices;
495  }
496 
497  load_value(stream, branching_);
498  load_value(stream, trees_);
499  load_value(stream, centers_init_);
500  load_value(stream, leaf_size_);
501  load_value(stream, memoryCounter);
502 
503  indices = new int*[trees_];
504  root = new NodePtr[trees_];
505  for (int i=0; i<trees_; ++i) {
506  indices[i] = new int[size_];
507  load_value(stream, *indices[i], size_);
508  load_tree(stream, root[i], i);
509  }
510 
511  params["algorithm"] = getType();
512  params["branching"] = branching_;
513  params["trees"] = trees_;
514  params["centers_init"] = centers_init_;
515  params["leaf_size"] = leaf_size_;
516  }
517 
518 
528  void findNeighbors(ResultSet<DistanceType>& result, const ElementType* vec, const SearchParams& searchParams) CV_OVERRIDE
529  {
530 
531  const int maxChecks = get_param(searchParams,"checks",32);
532  const bool explore_all_trees = get_param(searchParams,"explore_all_trees",false);
533 
534  // Priority queue storing intermediate branches in the best-bin-first search
535  const cv::Ptr<Heap<BranchSt>>& heap = Heap<BranchSt>::getPooledInstance(cv::utils::getThreadID(), (int)size_);
536 
537  std::vector<bool> checked(size_,false);
538  int checks = 0;
539  for (int i=0; i<trees_; ++i) {
540  findNN(root[i], result, vec, checks, maxChecks, heap, checked, explore_all_trees);
541  if (!explore_all_trees && (checks >= maxChecks) && result.full())
542  break;
543  }
544 
545  BranchSt branch;
546  while (heap->popMin(branch) && (checks<maxChecks || !result.full())) {
547  NodePtr node = branch.node;
548  findNN(node, result, vec, checks, maxChecks, heap, checked, false);
549  }
550 
551  CV_Assert(result.full());
552  }
553 
554  IndexParams getParameters() const CV_OVERRIDE
555  {
556  return params;
557  }
558 
559 
560 private:
561 
565  struct Node
566  {
570  int pivot;
574  int size;
578  Node** childs;
582  int* indices;
586  int level;
587  };
588  typedef Node* NodePtr;
589 
590 
591 
595  typedef BranchStruct<NodePtr, DistanceType> BranchSt;
596 
597 
598 
599  void save_tree(FILE* stream, NodePtr node, int num)
600  {
601  save_value(stream, *node);
602  if (node->childs==NULL) {
603  int indices_offset = (int)(node->indices - indices[num]);
604  save_value(stream, indices_offset);
605  }
606  else {
607  for(int i=0; i<branching_; ++i) {
608  save_tree(stream, node->childs[i], num);
609  }
610  }
611  }
612 
613 
614  void load_tree(FILE* stream, NodePtr& node, int num)
615  {
616  node = pool.allocate<Node>();
617  load_value(stream, *node);
618  if (node->childs==NULL) {
619  int indices_offset;
620  load_value(stream, indices_offset);
621  node->indices = indices[num] + indices_offset;
622  }
623  else {
624  node->childs = pool.allocate<NodePtr>(branching_);
625  for(int i=0; i<branching_; ++i) {
626  load_tree(stream, node->childs[i], num);
627  }
628  }
629  }
630 
631 
635  void free_indices()
636  {
637  if (indices!=NULL) {
638  for(int i=0; i<trees_; ++i) {
639  if (indices[i]!=NULL) {
640  delete[] indices[i];
641  indices[i] = NULL;
642  }
643  }
644  }
645  }
646 
647 
648  void computeLabels(int* dsindices, int indices_length, int* centers, int centers_length, int* labels, DistanceType& cost)
649  {
650  cost = 0;
651  for (int i=0; i<indices_length; ++i) {
652  ElementType* point = dataset[dsindices[i]];
653  DistanceType dist = distance(point, dataset[centers[0]], veclen_);
654  labels[i] = 0;
655  for (int j=1; j<centers_length; ++j) {
656  DistanceType new_dist = distance(point, dataset[centers[j]], veclen_);
657  if (dist>new_dist) {
658  labels[i] = j;
659  dist = new_dist;
660  }
661  }
662  cost += dist;
663  }
664  }
665 
677  void computeClustering(NodePtr node, int* dsindices, int indices_length, int branching, int level)
678  {
679  node->size = indices_length;
680  node->level = level;
681 
682  if (indices_length < leaf_size_) { // leaf node
683  node->indices = dsindices;
684  std::sort(node->indices,node->indices+indices_length);
685  node->childs = NULL;
686  return;
687  }
688 
689  std::vector<int> centers(branching);
690  std::vector<int> labels(indices_length);
691 
692  int centers_length;
693  (this->*chooseCenters)(branching, dsindices, indices_length, &centers[0], centers_length);
694 
695  if (centers_length<branching) {
696  node->indices = dsindices;
697  std::sort(node->indices,node->indices+indices_length);
698  node->childs = NULL;
699  return;
700  }
701 
702 
703  // assign points to clusters
704  DistanceType cost;
705  computeLabels(dsindices, indices_length, &centers[0], centers_length, &labels[0], cost);
706 
707  node->childs = pool.allocate<NodePtr>(branching);
708  int start = 0;
709  int end = start;
710  for (int i=0; i<branching; ++i) {
711  for (int j=0; j<indices_length; ++j) {
712  if (labels[j]==i) {
713  std::swap(dsindices[j],dsindices[end]);
714  std::swap(labels[j],labels[end]);
715  end++;
716  }
717  }
718 
719  node->childs[i] = pool.allocate<Node>();
720  node->childs[i]->pivot = centers[i];
721  node->childs[i]->indices = NULL;
722  computeClustering(node->childs[i],dsindices+start, end-start, branching, level+1);
723  start=end;
724  }
725  }
726 
727 
728 
742  void findNN(NodePtr node, ResultSet<DistanceType>& result, const ElementType* vec, int& checks, int maxChecks,
743  const cv::Ptr<Heap<BranchSt>>& heap, std::vector<bool>& checked, bool explore_all_trees = false)
744  {
745  if (node->childs==NULL) {
746  if (!explore_all_trees && (checks>=maxChecks) && result.full()) {
747  return;
748  }
749  for (int i=0; i<node->size; ++i) {
750  int index = node->indices[i];
751  if (!checked[index]) {
752  DistanceType dist = distance(dataset[index], vec, veclen_);
753  result.addPoint(dist, index);
754  checked[index] = true;
755  ++checks;
756  }
757  }
758  }
759  else {
760  DistanceType* domain_distances = new DistanceType[branching_];
761  int best_index = 0;
762  domain_distances[best_index] = distance(vec, dataset[node->childs[best_index]->pivot], veclen_);
763  for (int i=1; i<branching_; ++i) {
764  domain_distances[i] = distance(vec, dataset[node->childs[i]->pivot], veclen_);
765  if (domain_distances[i]<domain_distances[best_index]) {
766  best_index = i;
767  }
768  }
769  for (int i=0; i<branching_; ++i) {
770  if (i!=best_index) {
771  heap->insert(BranchSt(node->childs[i],domain_distances[i]));
772  }
773  }
774  delete[] domain_distances;
775  findNN(node->childs[best_index],result,vec, checks, maxChecks, heap, checked, explore_all_trees);
776  }
777  }
778 
779 private:
780 
781 
785  const Matrix<ElementType> dataset;
786 
790  IndexParams params;
791 
792 
796  size_t size_;
797 
801  size_t veclen_;
802 
806  NodePtr* root;
807 
811  int** indices;
812 
813 
817  Distance distance;
818 
826  PooledAllocator pool;
827 
831  int memoryCounter;
832 
834  int branching_;
835  int trees_;
836  flann_centers_init_t centers_init_;
837  int leaf_size_;
838 
839 
840 };
841 
842 }
843 
845 
846 #endif /* OPENCV_FLANN_HIERARCHICAL_CLUSTERING_INDEX_H_ */
T distance(T... args)
CvMemStorage CvSeq ** labels
Definition: core_c.h:1724
double double end
Definition: core_c.h:1381
double start
Definition: core_c.h:1381
int index
Definition: core_c.h:634
CvSize size
Definition: core_c.h:112
const CvArr const CvArr CvArr * result
Definition: core_c.h:1423
#define CV_OVERRIDE
Definition: cvdef.h:792
#define CV_Assert(expr)
Checks a condition at runtime and throws exception if it fails.
Definition: base.hpp:342
#define CV_DbgAssert(expr)
Definition: base.hpp:375
CvRect r
Definition: imgproc_c.h:984
CV_EXPORTS OutputArray int double double InputArray OutputArray int int bool double k
Definition: imgproc.hpp:2133
InputArray int InputArray cost
Definition: imgproc.hpp:3387
T min(T... args)
@ StsError
unknown /unspecified error
Definition: base.hpp:71
CV_EXPORTS int getThreadID()
Definition: flann.hpp:60
T sort(T... args)
Definition: cvstd_wrapper.hpp:74
T swap(T... args)