BICO  1.0
 All Classes Namespaces Files Functions Variables Typedefs Pages
bico.h
Go to the documentation of this file.
1 #ifndef BICO_H
2 #define BICO_H
3 
4 #include <array>
5 #include <cmath>
6 #include <memory>
7 #include <random>
8 #include <vector>
9 
10 #include "../base/streamingalgorithm.h"
11 #include "../base/dissimilaritymeasure.h"
12 #include "../base/solutionprovider.h"
13 #include "../base/weightmodifier.h"
14 #include "../base/partitionprovider.h"
15 #include "../clustering/cfrentry.h"
16 #include "../datastructure/proxysolution.h"
17 #include "../evaluation/kmeansevaluator.h"
18 #include "../exception/invalidruntimeconfigurationexception.h"
19 #include "../misc/randomness.h"
20 
21 namespace CluE
22 {
23 
42 template<typename T> class Bico : public StreamingAlgorithm<T>
43 {
44 private:
45 
49  class BicoNode
50  {
51  public:
52  typedef std::pair<CFREntry<T>, BicoNode*> FeaturePair;
53  typedef std::list<FeaturePair> FeatureList;
54 
60  objectId(outer.nodeIdCounter),
61  outer(outer),
62  features()
63  {
64  ++outer.nodeIdCounter;
65  }
66 
70  void clear()
71  {
72  for (auto it = features.begin(); it != features.end(); ++it)
73  delete it->second;
74  }
75 
81  typename FeatureList::iterator insert(CFREntry<T> const & feature)
82  {
83  return features.insert(features.end(),
84  FeaturePair(feature, new BicoNode(outer)));
85  }
86 
91  typename FeatureList::iterator begin()
92  {
93  return features.begin();
94  }
95 
100  typename FeatureList::iterator end()
101  {
102  return features.end();
103  }
104 
109  size_t size()
110  {
111  return features.size();
112  }
113 
118  bool empty()
119  {
120  return features.empty();
121  }
122 
130  typename FeatureList::iterator nearest(T const & element, int level)
131  {
132  typename FeatureList::iterator minIt = features.end();
133  // Nearest neighbour search based on projections in level 1
134  if (level == 1)
135  {
136  // Project point and calculate projection bucket number
137  double val = outer.project(element, 0);
138  int bucket_number = outer.calcBucketNumber(0, val);
139  int mini = 0;
140  int bucket_min = bucket_number;
141  int mins;
142 
143  if ((bucket_number < 0) || (bucket_number > outer.buckets[0].size() - 1))
144  {
145  // The bucket does not exist (yet)
146  mins = 0;
147  }
148  else
149  {
150  // Search for the projection with smallest bucket size
151  mins = outer.buckets[mini][bucket_min].size();
152  for (int i = 1; i < outer.L; i++)
153  {
154  val = outer.project(element, i);
155  bucket_number = outer.calcBucketNumber(i, val);
156  if ((bucket_number >= 0) & (bucket_number <= outer.buckets[i].size() - 1))
157  {
158  int s = outer.buckets[i][bucket_number].size();
159  if (s < mins)
160  {
161  mins = s;
162  bucket_min = bucket_number;
163  mini = i;
164  }
165  }
166  else
167  {
168  mins = 0;
169  bucket_min = bucket_number;
170  mini = i;
171  break;
172  }
173  }
174 
175  }
176 
177  bucket_number = bucket_min;
178  int rnd = mini;
179 
180  if (bucket_number < 0)
181  {
182  // Bucket does not exist => create one
183  outer.allocateBucket(rnd, true);
184  }
185  else if (bucket_number > outer.buckets[rnd].size() - 1)
186  {
187  // Bucket does not exist => create one
188  outer.allocateBucket(rnd, false);
189  }
190  else
191  {
192  // Bucket does exist => search nearest point in bucket
193  double minDist = -1;
194 
195  for (auto it = outer.buckets[rnd][bucket_number].begin(); it != outer.buckets[rnd][bucket_number].end(); ++it)
196  {
197  double tmpDist = outer.measure->dissimilarity((*it)->first.representative, element);
198  if (tmpDist < minDist || minDist == -1)
199  {
200  minDist = tmpDist;
201  minIt = (*it);
202  }
203  }
204 
205  }
206  }
207  // Simple nearest neighbour search in all other levels
208  else
209  {
210  double minDist = -1;
211  for (auto it = features.begin(); it != features.end(); ++it)
212  {
213  double tmpDist = outer.measure->dissimilarity(it->first.representative, element);
214  if (tmpDist < minDist || minDist == -1)
215  {
216  minDist = tmpDist;
217  minIt = it;
218  }
219  }
220  }
221 
222  return minIt;
223  }
224 
229  void erase(typename FeatureList::iterator pos)
230  {
231  features.erase(pos);
232  }
233 
239  void spliceAllTo(BicoNode* to, typename FeatureList::iterator pos)
240  {
241  to->features.splice(pos, features);
242  }
243 
250  void spliceElementTo(typename FeatureList::iterator it, BicoNode* to, typename FeatureList::iterator pos)
251  {
252  to->features.splice(pos, features, it);
253  }
254 
259  int id()
260  {
261  return objectId;
262  }
263 
264  private:
268  int objectId;
269 
274 
279  };
280 
281 public:
296  Bico(size_t dimension, size_t n, size_t k, size_t p, size_t nMax,
298 
303  virtual ProxySolution<T>* compute();
304 
312  virtual Bico<T>& operator<<(T const & element);
313 
318  void print(std::ostream& os);
319 
320 private:
327  void insert(BicoNode* node, int level, T const & element);
328 
334  void allocateBucket(int bucket, bool left);
335 
342  int calcBucketNumber(int rnd, double val);
343 
347  void buildBuckets();
348 
355  double project(T point, int i);
356 
360  void rebuild();
361 
367  void rebuildFirstLevel(BicoNode* parent, BicoNode* child);
368 
374  void rebuildTraversMerge(BicoNode* node, int level);
375 
381  void computeTraverse(BicoNode* node, ProxySolution<T>* solution);
382 
388  double getT(int level);
389 
395  double getR(int level);
396 
402  void print(std::ostream& os, BicoNode* node);
403 
407  size_t k;
411  size_t L;
412 
416  std::vector<std::vector<double >> rndprojections;
417 
421  std::vector<std::vector<std::vector<typename BicoNode::FeatureList::iterator >> > buckets;
425  std::vector<std::pair<double, double >> borders;
429  std::vector<int> bucket_radius;
430 
435 
439  std::unique_ptr<DissimilarityMeasure<T >> measure;
440 
444  std::unique_ptr<WeightModifier<T >> weightModifier;
445 
449  size_t maxNumOfCFs;
450 
454  size_t curNumOfCFs;
455 
459  size_t dimension;
460 
464  double optEst;
465 
469  std::vector<double> maxVal;
470 
475 
480 
484  std::vector<T> buffer;
485 
490 };
491 
492 template<typename T> Bico<T>::Bico(size_t dim, size_t n, size_t k, size_t p, size_t nMax,
493  DissimilarityMeasure<T>* measure, WeightModifier<T>* weightModifier) :
494 nodeIdCounter(0),
495 measure(measure->clone()),
496 weightModifier(weightModifier->clone()),
497 maxNumOfCFs(nMax),
498 curNumOfCFs(0),
499 k(k),
500 L(p),
501 optEst(-1),
502 root(new BicoNode(*this)),
503 bufferPhase(true),
504 numOfRebuilds(0),
505 buffer(),
506 dimension(dim)
507 {
509  std::vector<double> rndpoint(dimension);
510  rndprojections.resize(L);
511  bucket_radius.resize(L);
512  maxVal.resize(L);
513  double norm;
514  std::normal_distribution<double> realDist(0.0, 1.0);
515  for (int i = 0; i < L; i++)
516  {
517  maxVal[i] = -1;
518  norm = 0.0;
519  for (int j = 0; j < dimension; j++)
520  {
521  rndpoint[j] = realDist(rg);
522  norm += rndpoint[j] * rndpoint[j];
523  }
524  norm = std::sqrt(norm);
525  for (int j = 0; j < dimension; j++)
526  {
527  rndpoint[j] /= norm;
528  }
529  rndprojections[i] = rndpoint;
530  }
531  buckets.resize(L);
532  buffer.reserve(maxNumOfCFs + 1);
533 }
534 
535 template<typename T> int Bico<T>::calcBucketNumber(int rnd, double val)
536 {
537  return (int) floor((val - borders[rnd].first) / bucket_radius[rnd]);
538 }
539 
540 template<typename T> void Bico<T>::buildBuckets()
541 {
542  double Size = 0;
543  for (int i = 0; i < L; i++)
544  {
545  // Compute new bucket size
546  if (buckets[i].size() == 1)
547  {
548  Size = 1;
549  }
550  else
551  {
552  bucket_radius[i] = (int) ceil(sqrt(getR(1)));
553  Size = (int) ceil((borders[i].second - borders[i].first) / (double) bucket_radius[i]);
554  }
555  for (int l = 0; l < buckets[i].size(); l++) buckets[i][l].clear();
556  // Create new buckets
557  buckets[i].clear();
558  buckets[i].resize((int) ceil(Size));
559  }
560 }
561 
562 template<typename T> void Bico<T>::allocateBucket(int bucket, bool left)
563 {
564  if (left)
565  {
566  // Push front bucket
567  borders[bucket].first = 2 * borders[bucket].first - borders[bucket].second;
568  std::vector < std::vector<typename BicoNode::FeatureList::iterator >> a(2 * buckets[bucket].size());
569  for (int i = 0; i < buckets[bucket].size(); i++)
570  {
571  a[buckets[bucket].size() + i] = buckets[bucket][i];
572  }
573  for (int l = 0; l < buckets[bucket].size(); l++) buckets[bucket][l].clear();
574  buckets[bucket].clear();
575  buckets[bucket] = a;
576  }
577  else
578  {
579  // Push back bucket
580  borders[bucket].second = 2 * borders[bucket].second - borders[bucket].first;
581  std::vector < std::vector<typename BicoNode::FeatureList::iterator >> a(2 * buckets[bucket].size());
582  for (int i = 0; i < buckets[bucket].size(); i++)
583  {
584  a[i] = buckets[bucket][i];
585  }
586  for (int l = 0; l < buckets[bucket].size(); l++) buckets[bucket][l].clear();
587  buckets[bucket].clear();
588  buckets[bucket] = a;
589  }
590 }
591 
592 template<typename T> double Bico<T>::project(T point, int i)
593 {
594  double ip = 0.0;
595  for (int j = 0; j < dimension; j++)
596  {
597  ip += point[j]*(rndprojections[i][j]);
598  }
599  return ip;
600 }
601 
602 template<typename T> ProxySolution<T>* Bico<T>::compute()
603 {
604  ProxySolution<T>* result = new ProxySolution<T>();
605  result->proxysets.push_back(std::vector<T>());
606  result->proxysets[0].reserve(curNumOfCFs);
607  computeTraverse(root, result);
608  return result;
609 }
610 
611 template<typename T> void Bico<T>::computeTraverse(BicoNode* node, ProxySolution<T>* solution)
612 {
613  for (auto it = node->begin(); it != node->end(); ++it)
614  {
615  T point(it->first.cog());
616  weightModifier->setWeight(point, it->first.number);
617  solution->proxysets[0].push_back(point);
618  computeTraverse(it->second, solution);
619  }
620 }
621 
622 template<typename T> Bico<T>& Bico<T>::operator<<(T const & element)
623 {
624  if (bufferPhase)
625  {
626  static double minDist = std::numeric_limits<double>::infinity();
627  static size_t pairwise_different = 0;
628  // Find nearest neighbor
629  for (size_t i = 0; i < buffer.size(); ++i)
630  {
631  double tmpDist = measure->dissimilarity(buffer[i], element);
632  if (tmpDist > 0)
633  {
634  ++pairwise_different;
635  if (tmpDist < minDist)
636  minDist = tmpDist;
637  }
638  for (int i = 0; i < L; i++)
639  {
640  double val = std::abs(project(element, i));
641  if (val > maxVal[i] || maxVal[i] == -1)
642  {
643  maxVal[i] = val;
644  }
645  }
646 
647  }
648  buffer.push_back(element);
649 
650  // Enough pairwise different elements to estimate optimal cost?
651  if (pairwise_different >= maxNumOfCFs + 1)
652  {
653  optEst = 16.0 * minDist;
654  int radius = (int) ceil(sqrt(getR(1)));
655  borders.resize(L);
656  for (int i = 0; i < L; i++)
657  {
658  borders[i].first = -maxVal[i];
659  borders[i].second = maxVal[i];
660  bucket_radius[i] = radius;
661  }
662  buildBuckets();
663 
664  // Insert buffered elements into tree
665  for (auto it = buffer.begin(); it != buffer.end(); ++it)
666  insert(root, 1, *it);
667  buffer.resize(0);
668  bufferPhase = false;
669  }
670  }
671  else
672  insert(root, 1, element);
673  return *this;
674 }
675 
676 template<typename T> void Bico<T>::insert(BicoNode* node, int level, T const & element)
677 {
678 
679  if (optEst < 0)
680  throw (InvalidRuntimeConfigurationException(0, "Estimated optimal cost invalid"));
681 
682  // Determine nearest clustering feature in current node
683  typename BicoNode::FeatureList::iterator nearest(node->nearest(element, level));
684 
685 
686  // Construct new clustering feature if element is too far away from
687  // nearest clustering feature or insert element into nearest
688  if (node->empty() || nearest == node->end()
689  || measure->dissimilarity(nearest->first.representative, element) > getR(level))
690  {
691  CFREntry<T> feature(1, element, element * element, element);
692  typename BicoNode::FeatureList::iterator itele = node->insert(feature);
693 
694  if (level == 1)
695  {
696  for (int i = 0; i < L; i++)
697  {
698  double val = project(element, i);
699  int bucket_number = calcBucketNumber(i, val);
700 
701  if (bucket_number < 0)
702  {
703  while (bucket_number < 0)
704  {
705  allocateBucket(i, true);
706  bucket_number = calcBucketNumber(i, val);
707  }
708  }
709  else if (bucket_number > buckets[i].size() - 1)
710  {
711  while (bucket_number > buckets[i].size() - 1)
712  {
713  allocateBucket(i, false);
714  bucket_number = calcBucketNumber(i, val);
715  }
716  }
717  buckets[i][bucket_number].push_back(itele);
718  }
719  }
720 
721  ++curNumOfCFs;
722  }
723  else
724  {
725  T center(nearest->first.cog());
726  // Insert element into (a copy of) nearest and compute cost
727  // for insertion at current level
728  CFEntry<T> testFeature(nearest->first);
729  testFeature.insert(element);
730  double tfCost = testFeature.kMeansCost(center);
731 
732  // Insert element either to current level (if cost remains small)
733  // or to higher level
734  if (tfCost < getT(level))
735  {
736  nearest->first.insert(element);
737  }
738  else
739  {
740  insert(nearest->second, level + 1, element);
741  }
742  }
743 
744  // Rebuild?
745  while (curNumOfCFs > maxNumOfCFs)
746  {
747  rebuild();
748  }
749 }
750 
751 template<typename T> void Bico<T>::rebuild()
752 {
753  // Rebuild first level
754  BicoNode * oldRoot(this->root);
755  this->root = new BicoNode(*this);
756  rebuildFirstLevel(this->root, oldRoot);
757  oldRoot->clear();
758  delete oldRoot;
759 
760  // Traverse through tree and merge
761  for (auto it = this->root->begin(); it != this->root->end(); ++it)
762  rebuildTraversMerge(it->second, 2);
763 }
764 
765 template<typename T> void Bico<T>::rebuildFirstLevel(BicoNode* parent, BicoNode* child)
766 {
767  optEst *= 2.0;
768  ++numOfRebuilds;
769 
770  buildBuckets();
771 
772  // The current element it may be spliced around the tree, so nextIt
773  // will maintain an iterator pointing at the next element in child
774  auto nextIt = child->begin();
775  for (auto it = child->begin(); it != child->end(); it = nextIt)
776  {
777  ++nextIt;
778  // Determine clustering feature in parent that is nearest to child
779  typename BicoNode::FeatureList::iterator nearest(parent->nearest(it->first.representative, 1));
780  if (parent->empty() || nearest == parent->end()
781  || measure->dissimilarity(nearest->first.representative, it->first.representative) > getR(1))
782  {
783  // Move it from child to parent
784  child->spliceElementTo(it, parent, parent->end());
785 
786  for (int i = 0; i < L; i++)
787  {
788  double val = project(it->first.representative, i);
789  int bucket_number = calcBucketNumber(i, val);
790  if (bucket_number < 0)
791  {
792  while (bucket_number < 0)
793  {
794  allocateBucket(i, true);
795  bucket_number = calcBucketNumber(i, val);
796  }
797  }
798  else if (bucket_number > buckets[i].size() - 1)
799  {
800  while (bucket_number > buckets[i].size() - 1)
801  {
802  allocateBucket(i, false);
803  bucket_number = calcBucketNumber(i, val);
804  }
805  }
806  buckets[i][bucket_number].push_back(it);
807  }
808  }
809  else
810  {
811  CFEntry<T> testFeature(it->first);
812  testFeature += nearest->first;
813  if (testFeature.kMeansCost(nearest->first.representative) <= getT(1))
814  {
815  // Insert it into nearest
816  nearest->first += it->first;
817  // Make children of it children of nearest
818  it->second->spliceAllTo(nearest->second, nearest->second->end());
819  // Delete (now) empty child node of it
820  it->second->clear();
821  delete it->second;
822  // Remove it from tree and delete it
823  child->erase(it);
824  --curNumOfCFs;
825  }
826  else
827  {
828  // Make it a child of nearest
829  child->spliceElementTo(it, nearest->second, nearest->second->end());
830  }
831  }
832  }
833 }
834 
835 template<typename T> void Bico<T>::rebuildTraversMerge(BicoNode* node, int level)
836 {
837  for (auto parentIt = node->begin(); parentIt != node->end(); ++parentIt)
838  {
839  if (!parentIt->second->empty())
840  {
841  auto nextIt = parentIt->second->begin();
842  for (auto childIt = parentIt->second->begin(); childIt != parentIt->second->end(); childIt = nextIt)
843  {
844  ++nextIt;
845 
846  T center(parentIt->first.cog());
847  // Insert element into (a copy of) nearest and compute cost
848  // for insertion at current level
849  CFEntry<T> testFeature(parentIt->first + childIt->first);
850  double tfCost = testFeature.kMeansCost(center);
851 
852  // Merge if possible
853  if (tfCost < getT(level))
854  {
855  parentIt->first += childIt->first;
856  childIt->second->spliceAllTo(parentIt->second, parentIt->second->end());
857  childIt->second->clear();
858  delete childIt->second;
859  parentIt->second->erase(childIt);
860  --curNumOfCFs;
861  }
862  else
863  {
864  rebuildTraversMerge(childIt->second, level + 1);
865  }
866  }
867  }
868  }
869 }
870 
871 template<typename T> double Bico<T>::getT(int level)
872 {
873  return optEst;
874 }
875 
876 template<typename T> double Bico<T>::getR(int level)
877 {
878  return getT(level) / (double(1 << (3 + level)));
879 }
880 
881 template<typename T> void Bico<T>::print(std::ostream& os)
882 {
883  os << "digraph G{\n";
884  os << "node [shape=record];\n";
885  print(os, root);
886  os << "}\n";
887 }
888 
889 template<typename T> void Bico<T>::print(std::ostream& os, BicoNode* node)
890 {
891  int id = node->id();
892 
893  os << "node" << id << "[label=\"";
894  int fvalue = 0;
895  os << node->id() << "|";
896  for (auto it = node->begin(); it != node->end(); ++it)
897  {
898  if (fvalue > 0)
899  os << "|";
900  os << "<f" << fvalue << "> " << it->first.number << "," << it->first.representative
901  << "\\n" << it->first.LS << "," << it->first.SS;
902  fvalue++;
903  }
904  os << "\"];\n";
905 
906  fvalue = 0;
907  for (auto it = node->begin(); it != node->end(); ++it)
908  {
909  print(os, it->second);
910  os << "node" << id << ":f" << fvalue << " -> node" << it->second->id() << ";\n";
911  fvalue++;
912  }
913 }
914 
915 }
916 
917 #endif