ATTPCROOT  0.3.0-alpha
A ROOT-based framework for analyzing data from active target detectors
fastcluster_dm.cxx
Go to the documentation of this file.
1 /*
2  fastcluster: Fast hierarchical clustering routines for R and Python
3 
4  Copyright © 2011 Daniel Müllner
5  <http://danifold.net>
6 
7  This library implements various fast algorithms for hierarchical,
8  agglomerative clustering methods:
9 
10  (1) Algorithms for the "stored matrix approach": the input is the array of
11  pairwise dissimilarities.
12 
13  MST_linkage_core: single linkage clustering with the "minimum spanning
14  tree algorithm (Rohlfs)
15 
16  NN_chain_core: nearest-neighbor-chain algorithm, suitable for single,
17  complete, average, weighted and Ward linkage (Murtagh)
18 
19  generic_linkage: generic algorithm, suitable for all distance update
20  formulas (Müllner)
21 
22  (2) Algorithms for the "stored data approach": the input are points in a
23  vector space.
24 
25  MST_linkage_core_vector: single linkage clustering for vector data
26 
27  generic_linkage_vector: generic algorithm for vector data, suitable for
28  the Ward, centroid and median methods.
29 
30  generic_linkage_vector_alternative: alternative scheme for updating the
31  nearest neighbors. This method seems faster than "generic_linkage_vector"
32  for the centroid and median methods but slower for the Ward method.
33 
34  All these implementation treat infinity values correctly. They throw an
35  exception if a NaN distance value occurs.
36 */
37 
38 // Older versions of Microsoft Visual Studio do not have the fenv header.
39 #ifdef _MSC_VER
40 #if (_MSC_VER == 1500 || _MSC_VER == 1600)
41 #define NO_INCLUDE_FENV
42 #endif
43 #endif
44 // NaN detection via fenv might not work on systems with software
45 // floating-point emulation (bug report for Debian armel).
46 #ifdef __SOFTFP__
47 #define NO_INCLUDE_FENV
48 #endif
49 #ifdef NO_INCLUDE_FENV
50 #pragma message("Do not use fenv header.")
51 #else
52 #pragma message("Use fenv header. If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.")
53 #pragma STDC FENV_ACCESS on
54 #include <fenv.h>
55 #endif
56 
57 #include <algorithm> // for std::fill_n
58 #include <cfloat> // also for DBL_MAX, DBL_MIN
59 #include <cmath> // for std::pow, std::sqrt
60 #include <cstddef> // for std::ptrdiff_t
61 #include <limits> // for std::numeric_limits<...>::infinity()
62 #include <stdexcept> // for std::runtime_error
63 #include <string> // for std::string
64 #ifndef DBL_MANT_DIG
65 #error The constant DBL_MANT_DIG could not be defined.
66 #endif
67 #define T_FLOAt_MANT_DIG DBL_MANT_DIG
68 
69 #ifndef LONG_MAX
70 #include <climits>
71 #endif
72 #ifndef LONG_MAX
73 #error The constant LONG_MAX could not be defined.
74 #endif
75 #ifndef INT_MAX
76 #error The constant INT_MAX could not be defined.
77 #endif
78 
79 #ifndef INT32_MAX
80 #ifdef _MSC_VER
81 #if _MSC_VER >= 1600
82 #define __STDC_LIMIT_MACROS
83 #include <stdint.h>
84 #else
85 typedef __int32 int_fast32_t;
86 typedef __int64 int64_t;
87 #endif
88 #else
89 #define __STDC_LIMIT_MACROS
90 #include <stdint.h>
91 #endif
92 #endif
93 
94 #define FILL_N std::fill_n
95 #ifdef _MSC_VER
96 #if _MSC_VER < 1600
97 #undef FILL_N
98 #define FILL_N stdext::unchecked_fill_n
99 #endif
100 #endif
101 
102 // Suppress warnings about (potentially) uninitialized variables.
103 #ifdef _MSC_VER
104 #pragma warning(disable : 4700)
105 #endif
106 
107 #ifndef HAVE_DIAGNOSTIC
108 #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
109 #define HAVE_DIAGNOSTIC 1
110 #endif
111 #endif
112 
113 #ifndef HAVE_VISIBILITY
114 #if __GNUC__ >= 4
115 #define HAVE_VISIBILITY 1
116 #endif
117 #endif
118 
119 /* Since the public interface is given by the Python respectively R interface,
120  * we do not want other symbols than the interface initalization routines to be
121  * visible in the shared object file. The "visibility" switch is a GCC concept.
122  * Hiding symbols keeps the relocation table small and decreases startup time.
123  * See http://gcc.gnu.org/wiki/Visibility
124  */
125 #if HAVE_VISIBILITY
126 #pragma GCC visibility push(hidden)
127 #endif
128 
129 typedef int_fast32_t t_index;
130 #ifndef INT32_MAX
131 #define MAX_INDEX 0x7fffffffL
132 #else
133 #define MAX_INDEX INT32_MAX
134 #endif
135 #if (LONG_MAX < MAX_INDEX)
136 #error The integer format "t_index" must not have a greater range than "long int".
137 #endif
138 #if (INT_MAX > MAX_INDEX)
139 #error The integer format "int" must not have a greater range than "t_index".
140 #endif
141 typedef double t_float;
142 
143 /* Method codes.
144 
145  These codes must agree with the METHODS array in fastcluster.R and the
146  dictionary mthidx in fastcluster.py.
147 */
149  // non-Euclidean methods
159 
161  MAX_METHOD_CODE = 7
162 };
163 
165  // Euclidean methods
170 
173 };
174 
175 // self-destructing array pointer
176 template <typename type>
178 private:
179  type *ptr;
180  auto_array_ptr(auto_array_ptr const &); // non construction-copyable
181  auto_array_ptr &operator=(auto_array_ptr const &); // non copyable
182 public:
183  auto_array_ptr() : ptr(NULL) {}
184  template <typename index>
185  auto_array_ptr(index const size) : ptr(new type[size])
186  {
187  }
188  template <typename index, typename value>
189  auto_array_ptr(index const size, value const val) : ptr(new type[size])
190  {
191  FILL_N(ptr, size, val);
192  }
193  ~auto_array_ptr() { delete[] ptr; }
194  void free()
195  {
196  delete[] ptr;
197  ptr = NULL;
198  }
199  template <typename index>
200  void init(index const size)
201  {
202  ptr = new type[size];
203  }
204  template <typename index, typename value>
205  void init(index const size, value const val)
206  {
207  init(size);
208  FILL_N(ptr, size, val);
209  }
210  inline operator type *() const { return ptr; }
211 };
212 
213 struct node {
216 };
217 
218 inline bool operator<(const node a, const node b)
219 {
220  return (a.dist < b.dist);
221 }
222 
224 private:
226  t_index pos;
227 
228 public:
229  cluster_result(const t_index size) : Z(size), pos(0) {}
230 
231  void append(const t_index node1, const t_index node2, const t_float dist)
232  {
233  Z[pos].node1 = node1;
234  Z[pos].node2 = node2;
235  Z[pos].dist = dist;
236  ++pos;
237  }
238 
239  node *operator[](const t_index idx) const { return Z + idx; }
240 
241  /* Define several methods to postprocess the distances. All these functions
242  are monotone, so they do not change the sorted order of distances. */
243 
244  void sqrt() const
245  {
246  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
247  ZZ->dist = std::sqrt(ZZ->dist);
248  }
249  }
250 
251  void sqrt(const t_float) const
252  { // ignore the argument
253  sqrt();
254  }
255 
256  void sqrtdouble(const t_float) const
257  { // ignore the argument
258  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
259  ZZ->dist = std::sqrt(2 * ZZ->dist);
260  }
261  }
262 
263 #ifdef R_pow
264 #define my_pow R_pow
265 #else
266 #define my_pow std::pow
267 #endif
268 
269  void power(const t_float p) const
270  {
271  t_float const q = 1 / p;
272  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
273  ZZ->dist = my_pow(ZZ->dist, q);
274  }
275  }
276 
277  void plusone(const t_float) const
278  { // ignore the argument
279  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
280  ZZ->dist += 1;
281  }
282  }
283 
284  void divide(const t_float denom) const
285  {
286  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
287  ZZ->dist /= denom;
288  }
289  }
290 };
291 
293  /*
294  Class for a doubly linked list. Initially, the list is the integer range
295  [0, size]. We provide a forward iterator and a method to delete an index
296  from the list.
297 
298  Typical use: for (i=L.start; L<size; i=L.succ[I])
299  or
300  for (i=somevalue; L<size; i=L.succ[I])
301  */
302 public:
305 
306 private:
308  // Not necessarily private, we just do not need it in this instance.
309 
310 public:
312  // Initialize to the given size.
313  : start(0), succ(size + 1), pred(size + 1)
314  {
315  for (t_index i = 0; i < size; ++i) {
316  pred[i + 1] = i;
317  succ[i] = i + 1;
318  }
319  // pred[0] is never accessed!
320  // succ[size] is never accessed!
321  }
322 
324 
325  void remove(const t_index idx)
326  {
327  // Remove an index from the list.
328  if (idx == start) {
329  start = succ[idx];
330  } else {
331  succ[pred[idx]] = succ[idx];
332  pred[succ[idx]] = pred[idx];
333  }
334  succ[idx] = 0; // Mark as inactive
335  }
336 
337  bool is_inactive(t_index idx) const { return (succ[idx] == 0); }
338 };
339 
340 // Indexing functions
341 // D is the upper triangular part of a symmetric (NxN)-matrix
342 // We require r_ < c_ !
343 #define D_(r_, c_) (D[(static_cast<std::ptrdiff_t>(2 * N - 3 - (r_)) * (r_) >> 1) + (c_)-1])
344 // Z is an ((N-1)x4)-array
345 #define Z_(_r, _c) (Z[(_r)*4 + (_c)])
346 
347 /*
348  Lookup function for a union-find data structure.
349 
350  The function finds the root of idx by going iteratively through all
351  parent elements until a root is found. An element i is a root if
352  nodes[i] is zero. To make subsequent searches faster, the entry for
353  idx and all its parents is updated with the root element.
354  */
355 class union_find {
356 private:
358  t_index nextparent;
359 
360 public:
361  union_find(const t_index size) : parent(size > 0 ? 2 * size - 1 : 0, 0), nextparent(size) {}
362 
363  t_index Find(t_index idx) const
364  {
365  // NOLINTNEXTLINE
366  if (parent[idx] != 0) { // a → b
367  t_index p = idx;
368  idx = parent[idx];
369  if (parent[idx] != 0) { // a → b → c
370  do {
371  idx = parent[idx];
372  } while (parent[idx] != 0);
373  do {
374  t_index tmp = parent[p];
375  parent[p] = idx;
376  p = tmp;
377  } while (parent[p] != idx);
378  }
379  }
380  return idx;
381  }
382 
383  void Union(const t_index node1, const t_index node2) { parent[node1] = parent[node2] = nextparent++; }
384 };
385 
386 class nan_error {
387 };
388 #ifdef FE_INVALID
389 class fenv_error {
390 };
391 #endif
392 
393 static void MST_linkage_core(const t_index N, const t_float *const D, cluster_result &Z2)
394 {
395  /*
396  N: integer, number of data points
397  D: condensed distance matrix N*(N-1)/2
398  Z2: output data structure
399 
400  The basis of this algorithm is an algorithm by Rohlf:
401 
402  F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
403  The Computer Journal, vol. 16, 1973, p. 93–95.
404  */
405  t_index i;
406  t_index idx2;
407  doubly_linked_list active_nodes(N);
409 
410  t_index prev_node;
411  t_float min;
412 
413  // first iteration
414  idx2 = 1;
415  min = std::numeric_limits<t_float>::infinity();
416  for (i = 1; i < N; ++i) {
417  d[i] = D[i - 1];
418 #if HAVE_DIAGNOSTIC
419 #pragma GCC diagnostic push
420 #pragma GCC diagnostic ignored "-Wfloat-equal"
421 #endif
422  if (d[i] < min) {
423  min = d[i];
424  idx2 = i;
425  } else if (fc_isnan(d[i]))
426  throw(nan_error());
427 #if HAVE_DIAGNOSTIC
428 #pragma GCC diagnostic pop
429 #endif
430  }
431  Z2.append(0, idx2, min);
432 
433  for (t_index j = 1; j < N - 1; ++j) {
434  prev_node = idx2;
435  active_nodes.remove(prev_node);
436 
437  idx2 = active_nodes.succ[0];
438  min = d[idx2];
439  for (i = idx2; i < prev_node; i = active_nodes.succ[i]) {
440  t_float tmp = D_(i, prev_node);
441 #if HAVE_DIAGNOSTIC
442 #pragma GCC diagnostic push
443 #pragma GCC diagnostic ignored "-Wfloat-equal"
444 #endif
445  if (tmp < d[i])
446  d[i] = tmp;
447  else if (fc_isnan(tmp))
448  throw(nan_error());
449 #if HAVE_DIAGNOSTIC
450 #pragma GCC diagnostic pop
451 #endif
452  if (d[i] < min) {
453  min = d[i];
454  idx2 = i;
455  }
456  }
457  for (; i < N; i = active_nodes.succ[i]) {
458  t_float tmp = D_(prev_node, i);
459 #if HAVE_DIAGNOSTIC
460 #pragma GCC diagnostic push
461 #pragma GCC diagnostic ignored "-Wfloat-equal"
462 #endif
463  if (d[i] > tmp)
464  d[i] = tmp;
465  else if (fc_isnan(tmp))
466  throw(nan_error());
467 #if HAVE_DIAGNOSTIC
468 #pragma GCC diagnostic pop
469 #endif
470  if (d[i] < min) {
471  min = d[i];
472  idx2 = i;
473  }
474  }
475  Z2.append(prev_node, idx2, min);
476  }
477 }
478 
479 /* Functions for the update of the dissimilarity array */
480 
481 inline static void f_single(t_float *const b, const t_float a)
482 {
483  if (*b > a)
484  *b = a;
485 }
486 inline static void f_complete(t_float *const b, const t_float a)
487 {
488  if (*b < a)
489  *b = a;
490 }
491 inline static void f_average(t_float *const b, const t_float a, const t_float s, const t_float t)
492 {
493  *b = s * a + t * (*b);
494 #ifndef FE_INVALID
495 #if HAVE_DIAGNOSTIC
496 #pragma GCC diagnostic push
497 #pragma GCC diagnostic ignored "-Wfloat-equal"
498 #endif
499  if (fc_isnan(*b)) {
500  throw(nan_error());
501  }
502 #if HAVE_DIAGNOSTIC
503 #pragma GCC diagnostic pop
504 #endif
505 #endif
506 }
507 inline static void f_weighted(t_float *const b, const t_float a)
508 {
509  *b = (a + *b) * .5;
510 #ifndef FE_INVALID
511 #if HAVE_DIAGNOSTIC
512 #pragma GCC diagnostic push
513 #pragma GCC diagnostic ignored "-Wfloat-equal"
514 #endif
515  if (fc_isnan(*b)) {
516  throw(nan_error());
517  }
518 #if HAVE_DIAGNOSTIC
519 #pragma GCC diagnostic pop
520 #endif
521 #endif
522 }
523 inline static void
524 f_ward(t_float *const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v)
525 {
526  *b = ((v + s) * a - v * c + (v + t) * (*b)) / (s + t + v);
527 //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v);
528 #ifndef FE_INVALID
529 #if HAVE_DIAGNOSTIC
530 #pragma GCC diagnostic push
531 #pragma GCC diagnostic ignored "-Wfloat-equal"
532 #endif
533  if (fc_isnan(*b)) {
534  throw(nan_error());
535  }
536 #if HAVE_DIAGNOSTIC
537 #pragma GCC diagnostic pop
538 #endif
539 #endif
540 }
541 inline static void f_centroid(t_float *const b, const t_float a, const t_float stc, const t_float s, const t_float t)
542 {
543  *b = s * a - stc + t * (*b);
544 #ifndef FE_INVALID
545  if (fc_isnan(*b)) {
546  throw(nan_error());
547  }
548 #if HAVE_DIAGNOSTIC
549 #pragma GCC diagnostic pop
550 #endif
551 #endif
552 }
553 inline static void f_median(t_float *const b, const t_float a, const t_float c_4)
554 {
555  *b = (a + (*b)) * .5 - c_4;
556 #ifndef FE_INVALID
557 #if HAVE_DIAGNOSTIC
558 #pragma GCC diagnostic push
559 #pragma GCC diagnostic ignored "-Wfloat-equal"
560 #endif
561  if (fc_isnan(*b)) {
562  throw(nan_error());
563  }
564 #if HAVE_DIAGNOSTIC
565 #pragma GCC diagnostic pop
566 #endif
567 #endif
568 }
569 
570 template <method_codes method, typename t_members>
571 static void NN_chain_core(const t_index N, t_float *const D, t_members *const members, cluster_result &Z2)
572 {
573  /*
574  N: integer
575  D: condensed distance matrix N*(N-1)/2
576  Z2: output data structure
577 
578  This is the NN-chain algorithm, described on page 86 in the following book:
579 
580  Fionn Murtagh, Multidimensional Clustering Algorithms,
581  Vienna, Würzburg: Physica-Verlag, 1985.
582  */
583  t_index i;
584 
585  auto_array_ptr<t_index> NN_chain(N);
586  t_index NN_chain_tip = 0;
587 
588  t_index idx1, idx2;
589 
590  t_float size1, size2;
591  doubly_linked_list active_nodes(N);
592 
593  t_float min;
594 
595  for (t_float const *DD = D; DD != D + (static_cast<std::ptrdiff_t>(N) * (N - 1) >> 1); ++DD) {
596 #if HAVE_DIAGNOSTIC
597 #pragma GCC diagnostic push
598 #pragma GCC diagnostic ignored "-Wfloat-equal"
599 #endif
600  if (fc_isnan(*DD)) {
601  throw(nan_error());
602  }
603 #if HAVE_DIAGNOSTIC
604 #pragma GCC diagnostic pop
605 #endif
606  }
607 
608 #ifdef FE_INVALID
609  if (feclearexcept(FE_INVALID))
610  throw fenv_error();
611 #endif
612 
613  for (t_index j = 0; j < N - 1; ++j) {
614  if (NN_chain_tip <= 3) {
615  NN_chain[0] = idx1 = active_nodes.start;
616  NN_chain_tip = 1;
617 
618  idx2 = active_nodes.succ[idx1];
619  min = D_(idx1, idx2);
620  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i]) {
621  if (D_(idx1, i) < min) {
622  min = D_(idx1, i);
623  idx2 = i;
624  }
625  }
626  } // a: idx1 b: idx2
627  else {
628  NN_chain_tip -= 3;
629  idx1 = NN_chain[NN_chain_tip - 1];
630  idx2 = NN_chain[NN_chain_tip];
631  min = idx1 < idx2 ? D_(idx1, idx2) : D_(idx2, idx1);
632  } // a: idx1 b: idx2
633 
634  do {
635  NN_chain[NN_chain_tip] = idx2;
636 
637  for (i = active_nodes.start; i < idx2; i = active_nodes.succ[i]) {
638  if (D_(i, idx2) < min) {
639  min = D_(i, idx2);
640  idx1 = i;
641  }
642  }
643  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i]) {
644  if (D_(idx2, i) < min) {
645  min = D_(idx2, i);
646  idx1 = i;
647  }
648  }
649 
650  idx2 = idx1;
651  idx1 = NN_chain[NN_chain_tip++];
652 
653  } while (idx2 != NN_chain[NN_chain_tip - 2]);
654 
655  Z2.append(idx1, idx2, min);
656 
657  if (idx1 > idx2) {
658  t_index tmp = idx1;
659  idx1 = idx2;
660  idx2 = tmp;
661  }
662 
663  if (method == METHOD_METR_AVERAGE || method == METHOD_METR_WARD) {
664  size1 = static_cast<t_float>(members[idx1]);
665  size2 = static_cast<t_float>(members[idx2]);
666  members[idx2] += members[idx1];
667  }
668 
669  // Remove the smaller index from the valid indices (active_nodes).
670  active_nodes.remove(idx1);
671 
672  switch (method) {
673  case METHOD_METR_SINGLE:
674  /*
675  Single linkage.
676 
677  Characteristic: new distances are never longer than the old distances.
678  */
679  // Update the distance matrix in the range [start, idx1).
680  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
681  f_single(&D_(i, idx2), D_(i, idx1));
682  // Update the distance matrix in the range (idx1, idx2).
683  for (; i < idx2; i = active_nodes.succ[i])
684  f_single(&D_(i, idx2), D_(idx1, i));
685  // Update the distance matrix in the range (idx2, N).
686  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
687  f_single(&D_(idx2, i), D_(idx1, i));
688  break;
689 
691  /*
692  Complete linkage.
693 
694  Characteristic: new distances are never shorter than the old distances.
695  */
696  // Update the distance matrix in the range [start, idx1).
697  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
698  f_complete(&D_(i, idx2), D_(i, idx1));
699  // Update the distance matrix in the range (idx1, idx2).
700  for (; i < idx2; i = active_nodes.succ[i])
701  f_complete(&D_(i, idx2), D_(idx1, i));
702  // Update the distance matrix in the range (idx2, N).
703  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
704  f_complete(&D_(idx2, i), D_(idx1, i));
705  break;
706 
707  case METHOD_METR_AVERAGE: {
708  /*
709  Average linkage.
710 
711  Shorter and longer distances can occur.
712  */
713  // Update the distance matrix in the range [start, idx1).
714  t_float s = size1 / (size1 + size2);
715  t_float t = size2 / (size1 + size2);
716  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
717  f_average(&D_(i, idx2), D_(i, idx1), s, t);
718  // Update the distance matrix in the range (idx1, idx2).
719  for (; i < idx2; i = active_nodes.succ[i])
720  f_average(&D_(i, idx2), D_(idx1, i), s, t);
721  // Update the distance matrix in the range (idx2, N).
722  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
723  f_average(&D_(idx2, i), D_(idx1, i), s, t);
724  break;
725  }
726 
728  /*
729  Weighted linkage.
730 
731  Shorter and longer distances can occur.
732  */
733  // Update the distance matrix in the range [start, idx1).
734  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
735  f_weighted(&D_(i, idx2), D_(i, idx1));
736  // Update the distance matrix in the range (idx1, idx2).
737  for (; i < idx2; i = active_nodes.succ[i])
738  f_weighted(&D_(i, idx2), D_(idx1, i));
739  // Update the distance matrix in the range (idx2, N).
740  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
741  f_weighted(&D_(idx2, i), D_(idx1, i));
742  break;
743 
744  case METHOD_METR_WARD:
745  /*
746  Ward linkage.
747 
748  Shorter and longer distances can occur, not smaller than min(d1,d2)
749  but maybe bigger than max(d1,d2).
750  */
751  // Update the distance matrix in the range [start, idx1).
752  // t_float v = static_cast<t_float>(members[i]);
753  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
754  f_ward(&D_(i, idx2), D_(i, idx1), min, size1, size2, static_cast<t_float>(members[i]));
755  // Update the distance matrix in the range (idx1, idx2).
756  for (; i < idx2; i = active_nodes.succ[i])
757  f_ward(&D_(i, idx2), D_(idx1, i), min, size1, size2, static_cast<t_float>(members[i]));
758  // Update the distance matrix in the range (idx2, N).
759  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
760  f_ward(&D_(idx2, i), D_(idx1, i), min, size1, size2, static_cast<t_float>(members[i]));
761  break;
762 
763  default: throw std::runtime_error(std::string("Invalid method."));
764  }
765  }
766 #ifdef FE_INVALID
767  if (fetestexcept(FE_INVALID))
768  throw fenv_error();
769 #endif
770 }
771 
773  /*
774  Class for a binary min-heap. The data resides in an array A. The elements of
775  A are not changed but two lists I and R of indices are generated which point
776  to elements of A and backwards.
777 
778  The heap tree structure is
779 
780  H[2*i+1] H[2*i+2]
781  \ /
782  \ /
783  ≤ ≤
784  \ /
785  \ /
786  H[i]
787 
788  where the children must be less or equal than their parent. Thus, H[0]
789  contains the minimum. The lists I and R are made such that H[i] = A[I[i]]
790  and R[I[i]] = i.
791 
792  This implementation is not designed to handle NaN values.
793  */
794 private:
795  t_float *const A;
796  t_index size;
799 
800  // no default constructor
801  binary_min_heap();
802  // noncopyable
804  binary_min_heap &operator=(binary_min_heap const &);
805 
806 public:
807  binary_min_heap(t_float *const A_, const t_index size_) : A(A_), size(size_), I(size), R(size)
808  { // Allocate memory and initialize the lists I and R to the identity. This
809  // does not make it a heap. Call heapify afterwards!
810  for (t_index i = 0; i < size; ++i)
811  R[i] = I[i] = i;
812  }
813 
814  binary_min_heap(t_float *const A_, const t_index size1, const t_index size2, const t_index start)
815  : A(A_), size(size1), I(size1), R(size2)
816  { // Allocate memory and initialize the lists I and R to the identity. This
817  // does not make it a heap. Call heapify afterwards!
818  for (t_index i = 0; i < size; ++i) {
819  R[i + start] = i;
820  I[i] = i + start;
821  }
822  }
823 
825 
826  void heapify()
827  {
828  // Arrange the indices I and R so that H[i] := A[I[i]] satisfies the heap
829  // condition H[i] < H[2*i+1] and H[i] < H[2*i+2] for each i.
830  //
831  // Complexity: Θ(size)
832  // Reference: Cormen, Leiserson, Rivest, Stein, Introduction to Algorithms,
833  // 3rd ed., 2009, Section 6.3 “Building a heap”
834  t_index idx;
835  for (idx = (size >> 1); idx > 0;) {
836  --idx;
837  update_geq_(idx);
838  }
839  }
840 
841  inline t_index argmin() const
842  {
843  // Return the minimal element.
844  return I[0];
845  }
846 
847  void heap_pop()
848  {
849  // Remove the minimal element from the heap.
850  --size;
851  I[0] = I[size];
852  R[I[0]] = 0;
853  update_geq_(0);
854  }
855 
856  void remove(t_index idx)
857  {
858  // Remove an element from the heap.
859  --size;
860  R[I[size]] = R[idx];
861  I[R[idx]] = I[size];
862  if (H(size) <= A[idx]) {
863  update_leq_(R[idx]);
864  } else {
865  update_geq_(R[idx]);
866  }
867  }
868 
869  void replace(const t_index idxold, const t_index idxnew, const t_float val)
870  {
871  R[idxnew] = R[idxold];
872  I[R[idxnew]] = idxnew;
873  if (val <= A[idxold])
874  update_leq(idxnew, val);
875  else
876  update_geq(idxnew, val);
877  }
878 
879  void update(const t_index idx, const t_float val) const
880  {
881  // Update the element A[i] with val and re-arrange the indices to preserve
882  // the heap condition.
883  if (val <= A[idx])
884  update_leq(idx, val);
885  else
886  update_geq(idx, val);
887  }
888 
889  void update_leq(const t_index idx, const t_float val) const
890  {
891  // Use this when the new value is not more than the old value.
892  A[idx] = val;
893  update_leq_(R[idx]);
894  }
895 
896  void update_geq(const t_index idx, const t_float val) const
897  {
898  // Use this when the new value is not less than the old value.
899  A[idx] = val;
900  update_geq_(R[idx]);
901  }
902 
903 private:
904  void update_leq_(t_index i) const
905  {
906  t_index j;
907  for (; (i > 0) && (H(i) < H(j = (i - 1) >> 1)); i = j)
908  heap_swap(i, j);
909  }
910 
911  void update_geq_(t_index i) const
912  {
913  t_index j;
914  for (; (j = 2 * i + 1) < size; i = j) {
915  if (H(j) >= H(i)) {
916  ++j;
917  if (j >= size || H(j) >= H(i))
918  break;
919  } else if (j + 1 < size && H(j + 1) < H(j))
920  ++j;
921  heap_swap(i, j);
922  }
923  }
924 
925  void heap_swap(const t_index i, const t_index j) const
926  {
927  // Swap two indices.
928  t_index tmp = I[i];
929  I[i] = I[j];
930  I[j] = tmp;
931  R[I[i]] = i;
932  R[I[j]] = j;
933  }
934 
935  inline t_float H(const t_index i) const { return A[I[i]]; }
936 };
937 
938 template <method_codes method, typename t_members>
939 static void generic_linkage(const t_index N, t_float *const D, t_members *const members, cluster_result &Z2)
940 {
941  /*
942  N: integer, number of data points
943  D: condensed distance matrix N*(N-1)/2
944  Z2: output data structure
945  */
946 
947  const t_index N_1 = N - 1;
948  t_index i, j; // loop variables
949  t_index idx1, idx2; // row and column indices
950 
951  auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
952  auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
953  auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
954  // i-th row represents
955  doubly_linked_list active_nodes(N);
956  binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
957  // the distance to the nearest neighbor of each point
958  t_index node1, node2; // node numbers in the output
959  t_float size1, size2; // and their cardinalities
960 
961  t_float min; // minimum and row index for nearest-neighbor search
962  t_index idx;
963 
964  for (i = 0; i < N; ++i)
965  // Build a list of row ↔ node label assignments.
966  // Initially i ↦ i
967  row_repr[i] = i;
968 
969  // Initialize the minimal distances:
970  // Find the nearest neighbor of each point.
971  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
972  t_float const *DD = D;
973  for (i = 0; i < N_1; ++i) {
974  min = std::numeric_limits<t_float>::infinity();
975  for (idx = j = i + 1; j < N; ++j, ++DD) {
976 #if HAVE_DIAGNOSTIC
977 #pragma GCC diagnostic push
978 #pragma GCC diagnostic ignored "-Wfloat-equal"
979 #endif
980  if (*DD < min) {
981  min = *DD;
982  idx = j;
983  } else if (fc_isnan(*DD))
984  throw(nan_error());
985  }
986 #if HAVE_DIAGNOSTIC
987 #pragma GCC diagnostic pop
988 #endif
989  mindist[i] = min;
990  n_nghbr[i] = idx;
991  }
992 
993  // Put the minimal distances into a heap structure to make the repeated
994  // global minimum searches fast.
995  nn_distances.heapify();
996 
997 #ifdef FE_INVALID
998  if (feclearexcept(FE_INVALID))
999  throw fenv_error();
1000 #endif
1001 
1002  // Main loop: We have N-1 merging steps.
1003  for (i = 0; i < N_1; ++i) {
1004  /*
1005  Here is a special feature that allows fast bookkeeping and updates of the
1006  minimal distances.
1007 
1008  mindist[i] stores a lower bound on the minimum distance of the point i to
1009  all points of higher index:
1010 
1011  mindist[i] ≥ min_{j>i} D(i,j)
1012 
1013  Normally, we have equality. However, this minimum may become invalid due
1014  to the updates in the distance matrix. The rules are:
1015 
1016  1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct
1017  minimum and n_nghbr[i] is a nearest neighbor.
1018 
1019  2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the
1020  correct minimum. The minimum needs to be recomputed.
1021 
1022  3) mindist[i] is never bigger than the true minimum. Hence, we never
1023  miss the true minimum if we take the smallest mindist entry,
1024  re-compute the value if necessary (thus maybe increasing it) and
1025  looking for the now smallest mindist entry until a valid minimal
1026  entry is found. This step is done in the lines below.
1027 
1028  The update process for D below takes care that these rules are
1029  fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are
1030  re-calculated when necessary but re-calculation is avoided whenever
1031  possible.
1032 
1033  The re-calculation of the minima makes the worst-case runtime of this
1034  algorithm cubic in N. We avoid this whenever possible, and in most cases
1035  the runtime appears to be quadratic.
1036  */
1037  idx1 = nn_distances.argmin();
1038  if (method != METHOD_METR_SINGLE) {
1039  while (mindist[idx1] < D_(idx1, n_nghbr[idx1])) {
1040  // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1041  n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1042  min = D_(idx1, j);
1043  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1044  if (D_(idx1, j) < min) {
1045  min = D_(idx1, j);
1046  n_nghbr[idx1] = j;
1047  }
1048  }
1049  /* Update the heap with the new true minimum and search for the
1050  (possibly different) minimal entry. */
1051  nn_distances.update_geq(idx1, min);
1052  idx1 = nn_distances.argmin();
1053  }
1054  }
1055 
1056  nn_distances.heap_pop(); // Remove the current minimum from the heap.
1057  idx2 = n_nghbr[idx1];
1058 
1059  // Write the newly found minimal pair of nodes to the output array.
1060  node1 = row_repr[idx1];
1061  node2 = row_repr[idx2];
1062 
1063  if (method == METHOD_METR_AVERAGE || method == METHOD_METR_WARD || method == METHOD_METR_CENTROID) {
1064  size1 = static_cast<t_float>(members[idx1]);
1065  size2 = static_cast<t_float>(members[idx2]);
1066  members[idx2] += members[idx1];
1067  }
1068  Z2.append(node1, node2, mindist[idx1]);
1069 
1070  // Remove idx1 from the list of active indices (active_nodes).
1071  active_nodes.remove(idx1);
1072  // Index idx2 now represents the new (merged) node with label N+i.
1073  row_repr[idx2] = N + i;
1074 
1075  // Update the distance matrix
1076  switch (method) {
1077  case METHOD_METR_SINGLE:
1078  /*
1079  Single linkage.
1080 
1081  Characteristic: new distances are never longer than the old distances.
1082  */
1083  // Update the distance matrix in the range [start, idx1).
1084  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1085  f_single(&D_(j, idx2), D_(j, idx1));
1086  if (n_nghbr[j] == idx1)
1087  n_nghbr[j] = idx2;
1088  }
1089  // Update the distance matrix in the range (idx1, idx2).
1090  for (; j < idx2; j = active_nodes.succ[j]) {
1091  f_single(&D_(j, idx2), D_(idx1, j));
1092  // If the new value is below the old minimum in a row, update
1093  // the mindist and n_nghbr arrays.
1094  if (D_(j, idx2) < mindist[j]) {
1095  nn_distances.update_leq(j, D_(j, idx2));
1096  n_nghbr[j] = idx2;
1097  }
1098  }
1099  // Update the distance matrix in the range (idx2, N).
1100  // Recompute the minimum mindist[idx2] and n_nghbr[idx2].
1101  if (idx2 < N_1) {
1102  min = mindist[idx2];
1103  for (j = active_nodes.succ[idx2]; j < N; j = active_nodes.succ[j]) {
1104  f_single(&D_(idx2, j), D_(idx1, j));
1105  if (D_(idx2, j) < min) {
1106  n_nghbr[idx2] = j;
1107  min = D_(idx2, j);
1108  }
1109  }
1110  nn_distances.update_leq(idx2, min);
1111  }
1112  break;
1113 
1114  case METHOD_METR_COMPLETE:
1115  /*
1116  Complete linkage.
1117 
1118  Characteristic: new distances are never shorter than the old distances.
1119  */
1120  // Update the distance matrix in the range [start, idx1).
1121  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1122  f_complete(&D_(j, idx2), D_(j, idx1));
1123  if (n_nghbr[j] == idx1)
1124  n_nghbr[j] = idx2;
1125  }
1126  // Update the distance matrix in the range (idx1, idx2).
1127  for (; j < idx2; j = active_nodes.succ[j])
1128  f_complete(&D_(j, idx2), D_(idx1, j));
1129  // Update the distance matrix in the range (idx2, N).
1130  for (j = active_nodes.succ[idx2]; j < N; j = active_nodes.succ[j])
1131  f_complete(&D_(idx2, j), D_(idx1, j));
1132  break;
1133 
1134  case METHOD_METR_AVERAGE: {
1135  /*
1136  Average linkage.
1137 
1138  Shorter and longer distances can occur.
1139  */
1140  // Update the distance matrix in the range [start, idx1).
1141  t_float s = size1 / (size1 + size2);
1142  t_float t = size2 / (size1 + size2);
1143  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1144  f_average(&D_(j, idx2), D_(j, idx1), s, t);
1145  if (n_nghbr[j] == idx1)
1146  n_nghbr[j] = idx2;
1147  }
1148  // Update the distance matrix in the range (idx1, idx2).
1149  for (; j < idx2; j = active_nodes.succ[j]) {
1150  f_average(&D_(j, idx2), D_(idx1, j), s, t);
1151  if (D_(j, idx2) < mindist[j]) {
1152  nn_distances.update_leq(j, D_(j, idx2));
1153  n_nghbr[j] = idx2;
1154  }
1155  }
1156  // Update the distance matrix in the range (idx2, N).
1157  if (idx2 < N_1) {
1158  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1159  f_average(&D_(idx2, j), D_(idx1, j), s, t);
1160  min = D_(idx2, j);
1161  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1162  f_average(&D_(idx2, j), D_(idx1, j), s, t);
1163  if (D_(idx2, j) < min) {
1164  min = D_(idx2, j);
1165  n_nghbr[idx2] = j;
1166  }
1167  }
1168  nn_distances.update(idx2, min);
1169  }
1170  break;
1171  }
1172 
1173  case METHOD_METR_WEIGHTED:
1174  /*
1175  Weighted linkage.
1176 
1177  Shorter and longer distances can occur.
1178  */
1179  // Update the distance matrix in the range [start, idx1).
1180  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1181  f_weighted(&D_(j, idx2), D_(j, idx1));
1182  if (n_nghbr[j] == idx1)
1183  n_nghbr[j] = idx2;
1184  }
1185  // Update the distance matrix in the range (idx1, idx2).
1186  for (; j < idx2; j = active_nodes.succ[j]) {
1187  f_weighted(&D_(j, idx2), D_(idx1, j));
1188  if (D_(j, idx2) < mindist[j]) {
1189  nn_distances.update_leq(j, D_(j, idx2));
1190  n_nghbr[j] = idx2;
1191  }
1192  }
1193  // Update the distance matrix in the range (idx2, N).
1194  if (idx2 < N_1) {
1195  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1196  f_weighted(&D_(idx2, j), D_(idx1, j));
1197  min = D_(idx2, j);
1198  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1199  f_weighted(&D_(idx2, j), D_(idx1, j));
1200  if (D_(idx2, j) < min) {
1201  min = D_(idx2, j);
1202  n_nghbr[idx2] = j;
1203  }
1204  }
1205  nn_distances.update(idx2, min);
1206  }
1207  break;
1208 
1209  case METHOD_METR_WARD:
1210  /*
1211  Ward linkage.
1212 
1213  Shorter and longer distances can occur, not smaller than min(d1,d2)
1214  but maybe bigger than max(d1,d2).
1215  */
1216  // Update the distance matrix in the range [start, idx1).
1217  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1218  f_ward(&D_(j, idx2), D_(j, idx1), mindist[idx1], size1, size2, static_cast<t_float>(members[j]));
1219  if (n_nghbr[j] == idx1)
1220  n_nghbr[j] = idx2;
1221  }
1222  // Update the distance matrix in the range (idx1, idx2).
1223  for (; j < idx2; j = active_nodes.succ[j]) {
1224  f_ward(&D_(j, idx2), D_(idx1, j), mindist[idx1], size1, size2, static_cast<t_float>(members[j]));
1225  if (D_(j, idx2) < mindist[j]) {
1226  nn_distances.update_leq(j, D_(j, idx2));
1227  n_nghbr[j] = idx2;
1228  }
1229  }
1230  // Update the distance matrix in the range (idx2, N).
1231  if (idx2 < N_1) {
1232  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1233  f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1], size1, size2, static_cast<t_float>(members[j]));
1234  min = D_(idx2, j);
1235  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1236  f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1], size1, size2, static_cast<t_float>(members[j]));
1237  if (D_(idx2, j) < min) {
1238  min = D_(idx2, j);
1239  n_nghbr[idx2] = j;
1240  }
1241  }
1242  nn_distances.update(idx2, min);
1243  }
1244  break;
1245 
1246  case METHOD_METR_CENTROID: {
1247  /*
1248  Centroid linkage.
1249 
1250  Shorter and longer distances can occur, not bigger than max(d1,d2)
1251  but maybe smaller than min(d1,d2).
1252  */
1253  // Update the distance matrix in the range [start, idx1).
1254  t_float s = size1 / (size1 + size2);
1255  t_float t = size2 / (size1 + size2);
1256  t_float stc = s * t * mindist[idx1];
1257  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1258  f_centroid(&D_(j, idx2), D_(j, idx1), stc, s, t);
1259  if (D_(j, idx2) < mindist[j]) {
1260  nn_distances.update_leq(j, D_(j, idx2));
1261  n_nghbr[j] = idx2;
1262  } else if (n_nghbr[j] == idx1)
1263  n_nghbr[j] = idx2;
1264  }
1265  // Update the distance matrix in the range (idx1, idx2).
1266  for (; j < idx2; j = active_nodes.succ[j]) {
1267  f_centroid(&D_(j, idx2), D_(idx1, j), stc, s, t);
1268  if (D_(j, idx2) < mindist[j]) {
1269  nn_distances.update_leq(j, D_(j, idx2));
1270  n_nghbr[j] = idx2;
1271  }
1272  }
1273  // Update the distance matrix in the range (idx2, N).
1274  if (idx2 < N_1) {
1275  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1276  f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1277  min = D_(idx2, j);
1278  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1279  f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1280  if (D_(idx2, j) < min) {
1281  min = D_(idx2, j);
1282  n_nghbr[idx2] = j;
1283  }
1284  }
1285  nn_distances.update(idx2, min);
1286  }
1287  break;
1288  }
1289 
1290  case METHOD_METR_MEDIAN: {
1291  /*
1292  Median linkage.
1293 
1294  Shorter and longer distances can occur, not bigger than max(d1,d2)
1295  but maybe smaller than min(d1,d2).
1296  */
1297  // Update the distance matrix in the range [start, idx1).
1298  t_float c_4 = mindist[idx1] * .25;
1299  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1300  f_median(&D_(j, idx2), D_(j, idx1), c_4);
1301  if (D_(j, idx2) < mindist[j]) {
1302  nn_distances.update_leq(j, D_(j, idx2));
1303  n_nghbr[j] = idx2;
1304  } else if (n_nghbr[j] == idx1)
1305  n_nghbr[j] = idx2;
1306  }
1307  // Update the distance matrix in the range (idx1, idx2).
1308  for (; j < idx2; j = active_nodes.succ[j]) {
1309  f_median(&D_(j, idx2), D_(idx1, j), c_4);
1310  if (D_(j, idx2) < mindist[j]) {
1311  nn_distances.update_leq(j, D_(j, idx2));
1312  n_nghbr[j] = idx2;
1313  }
1314  }
1315  // Update the distance matrix in the range (idx2, N).
1316  if (idx2 < N_1) {
1317  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1318  f_median(&D_(idx2, j), D_(idx1, j), c_4);
1319  min = D_(idx2, j);
1320  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1321  f_median(&D_(idx2, j), D_(idx1, j), c_4);
1322  if (D_(idx2, j) < min) {
1323  min = D_(idx2, j);
1324  n_nghbr[idx2] = j;
1325  }
1326  }
1327  nn_distances.update(idx2, min);
1328  }
1329  break;
1330  }
1331 
1332  default: throw std::runtime_error(std::string("Invalid method."));
1333  }
1334  }
1335 #ifdef FE_INVALID
1336  if (fetestexcept(FE_INVALID))
1337  throw fenv_error();
1338 #endif
1339 }
1340 
1341 /*
1342  Clustering methods for vector data
1343 */
1344 
1345 template <typename t_dissimilarity>
1346 static void MST_linkage_core_vector(const t_index N, t_dissimilarity &dist, cluster_result &Z2)
1347 {
1348  /*
1349  N: integer, number of data points
1350  dist: function pointer to the metric
1351  Z2: output data structure
1352 
1353  The basis of this algorithm is an algorithm by Rohlf:
1354 
1355  F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
1356  The Computer Journal, vol. 16, 1973, p. 93–95.
1357  */
1358  t_index i;
1359  t_index idx2;
1360  doubly_linked_list active_nodes(N);
1362 
1363  t_index prev_node;
1364  t_float min;
1365 
1366  // first iteration
1367  idx2 = 1;
1368  min = std::numeric_limits<t_float>::infinity();
1369  for (i = 1; i < N; ++i) {
1370  d[i] = dist(0, i);
1371 #if HAVE_DIAGNOSTIC
1372 #pragma GCC diagnostic push
1373 #pragma GCC diagnostic ignored "-Wfloat-equal"
1374 #endif
1375  if (d[i] < min) {
1376  min = d[i];
1377  idx2 = i;
1378  } else if (fc_isnan(d[i]))
1379  throw(nan_error());
1380 #if HAVE_DIAGNOSTIC
1381 #pragma GCC diagnostic pop
1382 #endif
1383  }
1384 
1385  Z2.append(0, idx2, min);
1386 
1387  for (t_index j = 1; j < N - 1; ++j) {
1388  prev_node = idx2;
1389  active_nodes.remove(prev_node);
1390 
1391  idx2 = active_nodes.succ[0];
1392  min = d[idx2];
1393 
1394  for (i = idx2; i < N; i = active_nodes.succ[i]) {
1395  t_float tmp = dist(i, prev_node);
1396 #if HAVE_DIAGNOSTIC
1397 #pragma GCC diagnostic push
1398 #pragma GCC diagnostic ignored "-Wfloat-equal"
1399 #endif
1400  if (d[i] > tmp)
1401  d[i] = tmp;
1402  else if (fc_isnan(tmp))
1403  throw(nan_error());
1404 #if HAVE_DIAGNOSTIC
1405 #pragma GCC diagnostic pop
1406 #endif
1407  if (d[i] < min) {
1408  min = d[i];
1409  idx2 = i;
1410  }
1411  }
1412  Z2.append(prev_node, idx2, min);
1413  }
1414 }
1415 
1416 template <method_codes_vector method, typename t_dissimilarity>
1417 static void generic_linkage_vector(const t_index N, t_dissimilarity &dist, cluster_result &Z2)
1418 {
1419  /*
1420  N: integer, number of data points
1421  dist: function pointer to the metric
1422  Z2: output data structure
1423 
1424  This algorithm is valid for the distance update methods
1425  "Ward", "centroid" and "median" only!
1426  */
1427  const t_index N_1 = N - 1;
1428  t_index i, j; // loop variables
1429  t_index idx1, idx2; // row and column indices
1430 
1431  auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
1432  auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
1433  auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
1434  // i-th row represents
1435  doubly_linked_list active_nodes(N);
1436  binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
1437  // the distance to the nearest neighbor of each point
1438  t_index node1, node2; // node numbers in the output
1439  t_float min; // minimum and row index for nearest-neighbor search
1440 
1441  for (i = 0; i < N; ++i)
1442  // Build a list of row ↔ node label assignments.
1443  // Initially i ↦ i
1444  row_repr[i] = i;
1445 
1446  // Initialize the minimal distances:
1447  // Find the nearest neighbor of each point.
1448  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1449  for (i = 0; i < N_1; ++i) {
1450  min = std::numeric_limits<t_float>::infinity();
1451  t_index idx;
1452  for (idx = j = i + 1; j < N; ++j) {
1453  t_float tmp;
1454  switch (method) {
1455  case METHOD_VECTOR_WARD: tmp = dist.ward_initial(i, j); break;
1456  default: tmp = dist.template sqeuclidean<true>(i, j);
1457  }
1458  if (tmp < min) {
1459  min = tmp;
1460  idx = j;
1461  }
1462  }
1463  switch (method) {
1464  case METHOD_VECTOR_WARD: mindist[i] = t_dissimilarity::ward_initial_conversion(min); break;
1465  default: mindist[i] = min;
1466  }
1467  n_nghbr[i] = idx;
1468  }
1469 
1470  // Put the minimal distances into a heap structure to make the repeated
1471  // global minimum searches fast.
1472  nn_distances.heapify();
1473 
1474  // Main loop: We have N-1 merging steps.
1475  for (i = 0; i < N_1; ++i) {
1476  idx1 = nn_distances.argmin();
1477 
1478  while (active_nodes.is_inactive(n_nghbr[idx1])) {
1479  // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1480  n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1481  switch (method) {
1482  case METHOD_VECTOR_WARD:
1483  min = dist.ward(idx1, j);
1484  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1485  t_float const tmp = dist.ward(idx1, j);
1486  if (tmp < min) {
1487  min = tmp;
1488  n_nghbr[idx1] = j;
1489  }
1490  }
1491  break;
1492  default:
1493  min = dist.template sqeuclidean<true>(idx1, j);
1494  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1495  t_float const tmp = dist.template sqeuclidean<true>(idx1, j);
1496  if (tmp < min) {
1497  min = tmp;
1498  n_nghbr[idx1] = j;
1499  }
1500  }
1501  }
1502  /* Update the heap with the new true minimum and search for the (possibly
1503  different) minimal entry. */
1504  nn_distances.update_geq(idx1, min);
1505  idx1 = nn_distances.argmin();
1506  }
1507 
1508  nn_distances.heap_pop(); // Remove the current minimum from the heap.
1509  idx2 = n_nghbr[idx1];
1510 
1511  // Write the newly found minimal pair of nodes to the output array.
1512  node1 = row_repr[idx1];
1513  node2 = row_repr[idx2];
1514 
1515  Z2.append(node1, node2, mindist[idx1]);
1516 
1517  switch (method) {
1518  case METHOD_VECTOR_WARD:
1519  case METHOD_VECTOR_CENTROID: dist.merge_inplace(idx1, idx2); break;
1520  case METHOD_VECTOR_MEDIAN: dist.merge_inplace_weighted(idx1, idx2); break;
1521  default: throw std::runtime_error(std::string("Invalid method."));
1522  }
1523 
1524  // Index idx2 now represents the new (merged) node with label N+i.
1525  row_repr[idx2] = N + i;
1526  // Remove idx1 from the list of active indices (active_nodes).
1527  active_nodes.remove(idx1); // TBD later!!!
1528 
1529  // Update the distance matrix
1530  switch (method) {
1531  case METHOD_VECTOR_WARD:
1532  /*
1533  Ward linkage.
1534 
1535  Shorter and longer distances can occur, not smaller than min(d1,d2)
1536  but maybe bigger than max(d1,d2).
1537  */
1538  // Update the distance matrix in the range [start, idx1).
1539  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1540  if (n_nghbr[j] == idx2) {
1541  n_nghbr[j] = idx1; // invalidate
1542  }
1543  }
1544  // Update the distance matrix in the range (idx1, idx2).
1545  for (; j < idx2; j = active_nodes.succ[j]) {
1546  t_float const tmp = dist.ward(j, idx2);
1547  if (tmp < mindist[j]) {
1548  nn_distances.update_leq(j, tmp);
1549  n_nghbr[j] = idx2;
1550  } else if (n_nghbr[j] == idx2) {
1551  n_nghbr[j] = idx1; // invalidate
1552  }
1553  }
1554  // Find the nearest neighbor for idx2.
1555  if (idx2 < N_1) {
1556  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1557  min = dist.ward(idx2, j);
1558  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1559  t_float const tmp = dist.ward(idx2, j);
1560  if (tmp < min) {
1561  min = tmp;
1562  n_nghbr[idx2] = j;
1563  }
1564  }
1565  nn_distances.update(idx2, min);
1566  }
1567  break;
1568 
1569  default:
1570  /*
1571  Centroid and median linkage.
1572 
1573  Shorter and longer distances can occur, not bigger than max(d1,d2)
1574  but maybe smaller than min(d1,d2).
1575  */
1576  for (j = active_nodes.start; j < idx2; j = active_nodes.succ[j]) {
1577  t_float const tmp = dist.template sqeuclidean<true>(j, idx2);
1578  if (tmp < mindist[j]) {
1579  nn_distances.update_leq(j, tmp);
1580  n_nghbr[j] = idx2;
1581  } else if (n_nghbr[j] == idx2)
1582  n_nghbr[j] = idx1; // invalidate
1583  }
1584  // Find the nearest neighbor for idx2.
1585  if (idx2 < N_1) {
1586  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1587  min = dist.template sqeuclidean<true>(idx2, j);
1588  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1589  t_float const tmp = dist.template sqeuclidean<true>(idx2, j);
1590  if (tmp < min) {
1591  min = tmp;
1592  n_nghbr[idx2] = j;
1593  }
1594  }
1595  nn_distances.update(idx2, min);
1596  }
1597  }
1598  }
1599 }
1600 
1601 template <method_codes_vector method, typename t_dissimilarity>
1602 static void generic_linkage_vector_alternative(const t_index N, t_dissimilarity &dist, cluster_result &Z2)
1603 {
1604  /*
1605  N: integer, number of data points
1606  dist: function pointer to the metric
1607  Z2: output data structure
1608 
1609  This algorithm is valid for the distance update methods
1610  "Ward", "centroid" and "median" only!
1611  */
1612  const t_index N_1 = N - 1;
1613  t_index i, j = 0; // loop variables
1614  t_index idx1, idx2; // row and column indices
1615 
1616  auto_array_ptr<t_index> n_nghbr(2 * N - 2); // array of nearest neighbors
1617  auto_array_ptr<t_float> mindist(2 * N - 2); // distances to the nearest neighbors
1618 
1619  doubly_linked_list active_nodes(N + N_1);
1620  binary_min_heap nn_distances(&*mindist, N_1, 2 * N - 2,
1621  1); // minimum heap
1622  // structure for the distance to the nearest neighbor of each point
1623 
1624  t_float min; // minimum for nearest-neighbor searches
1625 
1626  // Initialize the minimal distances:
1627  // Find the nearest neighbor of each point.
1628  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1629  for (i = 1; i < N; ++i) {
1630  min = std::numeric_limits<t_float>::infinity();
1631  t_index idx;
1632  for (idx = j = 0; j < i; ++j) {
1633  t_float tmp;
1634  switch (method) {
1635  case METHOD_VECTOR_WARD: tmp = dist.ward_initial(i, j); break;
1636  default: tmp = dist.template sqeuclidean<true>(i, j);
1637  }
1638  if (tmp < min) {
1639  min = tmp;
1640  idx = j;
1641  }
1642  }
1643  switch (method) {
1644  case METHOD_VECTOR_WARD: mindist[i] = t_dissimilarity::ward_initial_conversion(min); break;
1645  default: mindist[i] = min;
1646  }
1647  n_nghbr[i] = idx;
1648  }
1649 
1650  // Put the minimal distances into a heap structure to make the repeated
1651  // global minimum searches fast.
1652  nn_distances.heapify();
1653 
1654  // Main loop: We have N-1 merging steps.
1655  for (i = N; i < N + N_1; ++i) {
1656  /*
1657  The bookkeeping is different from the "stored matrix approach" algorithm
1658  generic_linkage.
1659 
1660  mindist[i] stores a lower bound on the minimum distance of the point i to
1661  all points of *lower* index:
1662 
1663  mindist[i] ≥ min_{j<i} D(i,j)
1664 
1665  Moreover, new nodes do not re-use one of the old indices, but they are
1666  given a new, unique index (SciPy convention: initial nodes are 0,…,N−1,
1667  new nodes are N,…,2N−2).
1668 
1669  Invalid nearest neighbors are not recognized by the fact that the stored
1670  distance is smaller than the actual distance, but the list active_nodes
1671  maintains a flag whether a node is inactive. If n_nghbr[i] points to an
1672  active node, the entries nn_distances[i] and n_nghbr[i] are valid,
1673  otherwise they must be recomputed.
1674  */
1675  idx1 = nn_distances.argmin();
1676  while (active_nodes.is_inactive(n_nghbr[idx1])) {
1677  // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1678  n_nghbr[idx1] = j = active_nodes.start;
1679  switch (method) {
1680  case METHOD_VECTOR_WARD:
1681  min = dist.ward_extended(idx1, j);
1682  for (j = active_nodes.succ[j]; j < idx1; j = active_nodes.succ[j]) {
1683  t_float tmp = dist.ward_extended(idx1, j);
1684  if (tmp < min) {
1685  min = tmp;
1686  n_nghbr[idx1] = j;
1687  }
1688  }
1689  break;
1690  default:
1691  min = dist.sqeuclidean_extended(idx1, j);
1692  for (j = active_nodes.succ[j]; j < idx1; j = active_nodes.succ[j]) {
1693  t_float const tmp = dist.sqeuclidean_extended(idx1, j);
1694  if (tmp < min) {
1695  min = tmp;
1696  n_nghbr[idx1] = j;
1697  }
1698  }
1699  }
1700  /* Update the heap with the new true minimum and search for the (possibly
1701  different) minimal entry. */
1702  nn_distances.update_geq(idx1, min);
1703  idx1 = nn_distances.argmin();
1704  }
1705 
1706  idx2 = n_nghbr[idx1];
1707  active_nodes.remove(idx1);
1708  active_nodes.remove(idx2);
1709 
1710  Z2.append(idx1, idx2, mindist[idx1]);
1711 
1712  if (i < 2 * N_1) {
1713  switch (method) {
1714  case METHOD_VECTOR_WARD:
1715  case METHOD_VECTOR_CENTROID: dist.merge(idx1, idx2, i); break;
1716 
1717  case METHOD_VECTOR_MEDIAN: dist.merge_weighted(idx1, idx2, i); break;
1718 
1719  default: throw std::runtime_error(std::string("Invalid method."));
1720  }
1721 
1722  n_nghbr[i] = active_nodes.start;
1723  if (method == METHOD_VECTOR_WARD) {
1724  /*
1725  Ward linkage.
1726 
1727  Shorter and longer distances can occur, not smaller than min(d1,d2)
1728  but maybe bigger than max(d1,d2).
1729  */
1730  min = dist.ward_extended(active_nodes.start, i);
1731  for (j = active_nodes.succ[active_nodes.start]; j < i; j = active_nodes.succ[j]) {
1732  t_float tmp = dist.ward_extended(j, i);
1733  if (tmp < min) {
1734  min = tmp;
1735  n_nghbr[i] = j;
1736  }
1737  }
1738  } else {
1739  /*
1740  Centroid and median linkage.
1741 
1742  Shorter and longer distances can occur, not bigger than max(d1,d2)
1743  but maybe smaller than min(d1,d2).
1744  */
1745  min = dist.sqeuclidean_extended(active_nodes.start, i);
1746  for (j = active_nodes.succ[active_nodes.start]; j < i; j = active_nodes.succ[j]) {
1747  t_float tmp = dist.sqeuclidean_extended(j, i);
1748  if (tmp < min) {
1749  min = tmp;
1750  n_nghbr[i] = j;
1751  }
1752  }
1753  }
1754  if (idx2 < active_nodes.start) {
1755  nn_distances.remove(active_nodes.start);
1756  } else {
1757  nn_distances.remove(idx2);
1758  }
1759  nn_distances.replace(idx1, i, min);
1760  }
1761  }
1762 }
1763 
1764 #if HAVE_VISIBILITY
1765 #pragma GCC visibility pop
1766 #endif
auto_array_ptr
Definition: fastcluster_dm.cxx:177
METHOD_VECTOR_SINGLE
@ METHOD_VECTOR_SINGLE
Definition: fastcluster_dm.cxx:166
node::dist
t_float dist
Definition: fastcluster_dm.cxx:215
MIN_METHOD_VECTOR_CODE
@ MIN_METHOD_VECTOR_CODE
Definition: fastcluster_dm.cxx:171
auto_array_ptr::init
void init(index const size)
Definition: fastcluster_dm.cxx:200
union_find::Find
t_index Find(t_index idx) const
Definition: fastcluster_dm.cxx:363
cluster_result::append
void append(const t_index node1, const t_index node2, const t_float dist)
Definition: fastcluster_dm.cxx:231
cluster_result::operator[]
node * operator[](const t_index idx) const
Definition: fastcluster_dm.cxx:239
binary_min_heap::update_leq
void update_leq(const t_index idx, const t_float val) const
Definition: fastcluster_dm.cxx:889
cluster_result::power
void power(const t_float p) const
Definition: fastcluster_dm.cxx:269
method_codes
method_codes
Definition: fastcluster_dm.cxx:148
nan_error
Definition: fastcluster_dm.cxx:386
operator<
bool operator<(const node a, const node b)
Definition: fastcluster_dm.cxx:218
doubly_linked_list::~doubly_linked_list
~doubly_linked_list()
Definition: fastcluster_dm.cxx:323
METHOD_METR_SINGLE
@ METHOD_METR_SINGLE
Definition: fastcluster_dm.cxx:150
union_find::Union
void Union(const t_index node1, const t_index node2)
Definition: fastcluster_dm.cxx:383
binary_min_heap::remove
void remove(t_index idx)
Definition: fastcluster_dm.cxx:856
auto_array_ptr::init
void init(index const size, value const val)
Definition: fastcluster_dm.cxx:205
node
Definition: fastcluster_dm.cxx:213
fc_isnan
bool fc_isnan(double x)
Definition: fastcluster.cxx:17
doubly_linked_list::start
t_index start
Definition: fastcluster_dm.cxx:303
t_index
int_fast32_t t_index
Definition: fastcluster_dm.cxx:129
METHOD_METR_WEIGHTED
@ METHOD_METR_WEIGHTED
Definition: fastcluster_dm.cxx:153
cluster_result
Definition: fastcluster_dm.cxx:223
doubly_linked_list
Definition: fastcluster_dm.cxx:292
MAX_METHOD_CODE
@ MAX_METHOD_CODE
Definition: fastcluster_dm.cxx:161
binary_min_heap::replace
void replace(const t_index idxold, const t_index idxnew, const t_float val)
Definition: fastcluster_dm.cxx:869
METHOD_VECTOR_CENTROID
@ METHOD_VECTOR_CENTROID
Definition: fastcluster_dm.cxx:168
doubly_linked_list::is_inactive
bool is_inactive(t_index idx) const
Definition: fastcluster_dm.cxx:337
binary_min_heap::argmin
t_index argmin() const
Definition: fastcluster_dm.cxx:841
union_find
Definition: fastcluster_dm.cxx:355
cluster_result::sqrt
void sqrt() const
Definition: fastcluster_dm.cxx:244
auto_array_ptr::auto_array_ptr
auto_array_ptr(index const size, value const val)
Definition: fastcluster_dm.cxx:189
binary_min_heap::heapify
void heapify()
Definition: fastcluster_dm.cxx:826
binary_min_heap::binary_min_heap
binary_min_heap(t_float *const A_, const t_index size1, const t_index size2, const t_index start)
Definition: fastcluster_dm.cxx:814
MAX_METHOD_VECTOR_CODE
@ MAX_METHOD_VECTOR_CODE
Definition: fastcluster_dm.cxx:172
METHOD_VECTOR_MEDIAN
@ METHOD_VECTOR_MEDIAN
Definition: fastcluster_dm.cxx:169
METHOD_METR_MEDIAN
@ METHOD_METR_MEDIAN
Definition: fastcluster_dm.cxx:157
doubly_linked_list::doubly_linked_list
doubly_linked_list(const t_index size)
Definition: fastcluster_dm.cxx:311
auto_array_ptr::~auto_array_ptr
~auto_array_ptr()
Definition: fastcluster_dm.cxx:193
cluster_result::divide
void divide(const t_float denom) const
Definition: fastcluster_dm.cxx:284
binary_min_heap::heap_pop
void heap_pop()
Definition: fastcluster_dm.cxx:847
auto_array_ptr::auto_array_ptr
auto_array_ptr(index const size)
Definition: fastcluster_dm.cxx:185
auto_array_ptr::free
void free()
Definition: fastcluster_dm.cxx:194
t_float
double t_float
Definition: fastcluster_dm.cxx:141
METHOD_VECTOR_WARD
@ METHOD_VECTOR_WARD
Definition: fastcluster_dm.cxx:167
METHOD_METR_COMPLETE
@ METHOD_METR_COMPLETE
Definition: fastcluster_dm.cxx:151
method_codes_vector
method_codes_vector
Definition: fastcluster_dm.cxx:164
cluster_result::plusone
void plusone(const t_float) const
Definition: fastcluster_dm.cxx:277
union_find::union_find
union_find(const t_index size)
Definition: fastcluster_dm.cxx:361
cluster_result::sqrt
void sqrt(const t_float) const
Definition: fastcluster_dm.cxx:251
FILL_N
#define FILL_N
Definition: fastcluster_dm.cxx:94
cluster_result::cluster_result
cluster_result(const t_index size)
Definition: fastcluster_dm.cxx:229
node::node1
t_index node1
Definition: fastcluster_dm.cxx:214
binary_min_heap::update_geq
void update_geq(const t_index idx, const t_float val) const
Definition: fastcluster_dm.cxx:896
binary_min_heap::binary_min_heap
binary_min_heap(t_float *const A_, const t_index size_)
Definition: fastcluster_dm.cxx:807
cluster_result::sqrtdouble
void sqrtdouble(const t_float) const
Definition: fastcluster_dm.cxx:256
MIN_METHOD_CODE
@ MIN_METHOD_CODE
Definition: fastcluster_dm.cxx:160
METHOD_METR_CENTROID
@ METHOD_METR_CENTROID
Definition: fastcluster_dm.cxx:156
METHOD_METR_WARD_D
@ METHOD_METR_WARD_D
Definition: fastcluster_dm.cxx:155
node::node2
t_index node2
Definition: fastcluster_dm.cxx:214
size_
#define size_(r_)
Definition: fastcluster_R_dm.cxx:65
METHOD_METR_AVERAGE
@ METHOD_METR_AVERAGE
Definition: fastcluster_dm.cxx:152
doubly_linked_list::succ
auto_array_ptr< t_index > succ
Definition: fastcluster_dm.cxx:304
METHOD_METR_WARD
@ METHOD_METR_WARD
Definition: fastcluster_dm.cxx:154
binary_min_heap::update
void update(const t_index idx, const t_float val) const
Definition: fastcluster_dm.cxx:879
binary_min_heap::~binary_min_heap
~binary_min_heap()
Definition: fastcluster_dm.cxx:824
D_
#define D_(r_, c_)
Definition: fastcluster_dm.cxx:343
METHOD_METR_WARD_D2
@ METHOD_METR_WARD_D2
Definition: fastcluster_dm.cxx:158
binary_min_heap
Definition: fastcluster_dm.cxx:772
my_pow
#define my_pow
Definition: fastcluster_dm.cxx:266
doubly_linked_list::remove
void remove(const t_index idx)
Definition: fastcluster_dm.cxx:325
c
constexpr auto c
Definition: AtRadialChargeModel.cxx:20
auto_array_ptr::auto_array_ptr
auto_array_ptr()
Definition: fastcluster_dm.cxx:183