ATTPCROOT  0.3.0-alpha
A ROOT-based framework for analyzing data from active target detectors
fastcluster_dm.cxx
Go to the documentation of this file.
1 /*
2  fastcluster: Fast hierarchical clustering routines for R and Python
3 
4  Copyright © 2011 Daniel Müllner
5  <http://danifold.net>
6 
7  This library implements various fast algorithms for hierarchical,
8  agglomerative clustering methods:
9 
10  (1) Algorithms for the "stored matrix approach": the input is the array of
11  pairwise dissimilarities.
12 
13  MST_linkage_core: single linkage clustering with the "minimum spanning
14  tree algorithm (Rohlfs)
15 
16  NN_chain_core: nearest-neighbor-chain algorithm, suitable for single,
17  complete, average, weighted and Ward linkage (Murtagh)
18 
19  generic_linkage: generic algorithm, suitable for all distance update
20  formulas (Müllner)
21 
22  (2) Algorithms for the "stored data approach": the input are points in a
23  vector space.
24 
25  MST_linkage_core_vector: single linkage clustering for vector data
26 
27  generic_linkage_vector: generic algorithm for vector data, suitable for
28  the Ward, centroid and median methods.
29 
30  generic_linkage_vector_alternative: alternative scheme for updating the
31  nearest neighbors. This method seems faster than "generic_linkage_vector"
32  for the centroid and median methods but slower for the Ward method.
33 
34  All these implementation treat infinity values correctly. They throw an
35  exception if a NaN distance value occurs.
36 */
37 
38 // Older versions of Microsoft Visual Studio do not have the fenv header.
39 #ifdef _MSC_VER
40 #if (_MSC_VER == 1500 || _MSC_VER == 1600)
41 #define NO_INCLUDE_FENV
42 #endif
43 #endif
44 // NaN detection via fenv might not work on systems with software
45 // floating-point emulation (bug report for Debian armel).
46 #ifdef __SOFTFP__
47 #define NO_INCLUDE_FENV
48 #endif
49 #ifdef NO_INCLUDE_FENV
50 //#pragma message("Do not use fenv header.")
51 #else
52 //#pragma message("Use fenv header. If there is a warning about unknown #pragma STDC FENV_ACCESS, this can be ignored.")
53 //#pragma STDC FENV_ACCESS on
54 #include <fenv.h>
55 #endif
56 
57 #include <algorithm> // for std::fill_n
58 #include <cfloat> // also for DBL_MAX, DBL_MIN
59 #include <cmath> // for std::pow, std::sqrt
60 #include <cstddef> // for std::ptrdiff_t
61 #include <limits> // for std::numeric_limits<...>::infinity()
62 #include <stdexcept> // for std::runtime_error
63 #include <string> // for std::string
64 #ifndef DBL_MANT_DIG
65 #error The constant DBL_MANT_DIG could not be defined.
66 #endif
67 #define T_FLOAT_MANT_DIG DBL_MANT_DIG
68 
69 #ifndef LONG_MAX
70 #include <climits>
71 #endif
72 #ifndef LONG_MAX
73 #error The constant LONG_MAX could not be defined.
74 #endif
75 #ifndef INT_MAX
76 #error The constant INT_MAX could not be defined.
77 #endif
78 
79 #ifndef INT32_MAX
80 #ifdef _MSC_VER
81 #if _MSC_VER >= 1600
82 #define __STDC_LIMIT_MACROS
83 #include <stdint.h>
84 #else
85 typedef __int32 int_fast32_t;
86 typedef __int64 int64_t;
87 #endif
88 #else
89 #define __STDC_LIMIT_MACROS
90 #include <stdint.h>
91 #endif
92 #endif
93 
94 #define FILL_N std::fill_n
95 #ifdef _MSC_VER
96 #if _MSC_VER < 1600
97 #undef FILL_N
98 #define FILL_N stdext::unchecked_fill_n
99 #endif
100 #endif
101 
102 // Suppress warnings about (potentially) uninitialized variables.
103 #ifdef _MSC_VER
104 #pragma warning(disable : 4700)
105 #endif
106 
107 #ifndef HAVE_DIAGNOSTIC
108 #if __GNUC__ > 4 || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 6))
109 #define HAVE_DIAGNOSTIC 1
110 #endif
111 #endif
112 
113 #ifndef HAVE_VISIBILITY
114 #if __GNUC__ >= 4
115 #define HAVE_VISIBILITY 1
116 #endif
117 #endif
118 
119 /* Since the public interface is given by the Python respectively R interface,
120  * we do not want other symbols than the interface initalization routines to be
121  * visible in the shared object file. The "visibility" switch is a GCC concept.
122  * Hiding symbols keeps the relocation table small and decreases startup time.
123  * See http://gcc.gnu.org/wiki/Visibility
124  */
125 #if HAVE_VISIBILITY
126 #pragma GCC visibility push(hidden)
127 #endif
128 
129 typedef int_fast32_t t_index;
130 #ifndef INT32_MAX
131 #define MAX_INDEX 0x7fffffffL
132 #else
133 #define MAX_INDEX INT32_MAX
134 #endif
135 #if (LONG_MAX < MAX_INDEX)
136 #error The integer format "t_index" must not have a greater range than "long int".
137 #endif
138 #if (INT_MAX > MAX_INDEX)
139 #error The integer format "int" must not have a greater range than "t_index".
140 #endif
141 typedef double t_float;
142 
143 /* Method codes.
144 
145  These codes must agree with the METHODS array in fastcluster.R and the
146  dictionary mthidx in fastcluster.py.
147 */
149  // non-Euclidean methods
159 
161  MAX_METHOD_CODE = 7
162 };
163 
165  // Euclidean methods
170 
173 };
174 
175 // self-destructing array pointer
176 template <typename type>
177 class auto_array_ptr {
178 private:
179  type *ptr;
180  auto_array_ptr(auto_array_ptr const &); // non construction-copyable
181  auto_array_ptr &operator=(auto_array_ptr const &); // non copyable
182 public:
183  auto_array_ptr() : ptr(NULL) {}
184  template <typename index>
185  auto_array_ptr(index const size) : ptr(new type[size])
186  {
187  }
188  template <typename index, typename value>
189  auto_array_ptr(index const size, value const val) : ptr(new type[size])
190  {
191  FILL_N(ptr, size, val);
192  }
193  ~auto_array_ptr() { delete[] ptr; }
194  void free()
195  {
196  delete[] ptr;
197  ptr = NULL;
198  }
199  template <typename index>
200  void init(index const size)
201  {
202  ptr = new type[size];
203  }
204  template <typename index, typename value>
205  void init(index const size, value const val)
206  {
207  init(size);
208  FILL_N(ptr, size, val);
209  }
210  inline operator type *() const { return ptr; }
211 };
212 
213 struct node {
215  t_float dist;
216 };
217 
218 inline bool operator<(const node a, const node b)
219 {
220  return (a.dist < b.dist);
221 }
222 
223 class cluster_result {
224 private:
226  t_index pos;
227 
228 public:
229  cluster_result(const t_index size) : Z(size), pos(0) {}
230 
231  void append(const t_index node1, const t_index node2, const t_float dist)
232  {
233  Z[pos].node1 = node1;
234  Z[pos].node2 = node2;
235  Z[pos].dist = dist;
236  ++pos;
237  }
238 
239  node *operator[](const t_index idx) const { return Z + idx; }
240 
241  /* Define several methods to postprocess the distances. All these functions
242  are monotone, so they do not change the sorted order of distances. */
243 
244  void sqrt() const
245  {
246  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
247  ZZ->dist = std::sqrt(ZZ->dist);
248  }
249  }
250 
251  void sqrt(const t_float) const
252  { // ignore the argument
253  sqrt();
254  }
255 
256  void sqrtdouble(const t_float) const
257  { // ignore the argument
258  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
259  ZZ->dist = std::sqrt(2 * ZZ->dist);
260  }
261  }
262 
263 #ifdef R_pow
264 #define my_pow R_pow
265 #else
266 #define my_pow std::pow
267 #endif
268 
269  void power(const t_float p) const
270  {
271  t_float const q = 1 / p;
272  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
273  ZZ->dist = my_pow(ZZ->dist, q);
274  }
275  }
276 
277  void plusone(const t_float) const
278  { // ignore the argument
279  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
280  ZZ->dist += 1;
281  }
282  }
283 
284  void divide(const t_float denom) const
285  {
286  for (node *ZZ = Z; ZZ != Z + pos; ++ZZ) {
287  ZZ->dist /= denom;
288  }
289  }
290 };
291 
292 class doubly_linked_list {
293  /*
294  Class for a doubly linked list. Initially, the list is the integer range
295  [0, size]. We provide a forward iterator and a method to delete an index
296  from the list.
297 
298  Typical use: for (i=L.start; L<size; i=L.succ[I])
299  or
300  for (i=somevalue; L<size; i=L.succ[I])
301  */
302 public:
303  t_index start;
305 
306 private:
308  // Not necessarily private, we just do not need it in this instance.
309 
310 public:
312  // Initialize to the given size.
313  : start(0), succ(size + 1), pred(size + 1)
314  {
315  for (t_index i = 0; i < size; ++i) {
316  pred[i + 1] = i;
317  succ[i] = i + 1;
318  }
319  // pred[0] is never accessed!
320  // succ[size] is never accessed!
321  }
322 
324 
325  void remove(const t_index idx)
326  {
327  // Remove an index from the list.
328  if (idx == start) {
329  start = succ[idx];
330  } else {
331  succ[pred[idx]] = succ[idx];
332  pred[succ[idx]] = pred[idx];
333  }
334  succ[idx] = 0; // Mark as inactive
335  }
336 
337  bool is_inactive(t_index idx) const { return (succ[idx] == 0); }
338 };
339 
340 // Indexing functions
341 // D is the upper triangular part of a symmetric (NxN)-matrix
342 // We require r_ < c_ !
343 #define D_(r_, c_) (D[(static_cast<std::ptrdiff_t>(2 * N - 3 - (r_)) * (r_) >> 1) + (c_)-1])
344 // Z is an ((N-1)x4)-array
345 #define Z_(_r, _c) (Z[(_r)*4 + (_c)])
346 
347 /*
348  Lookup function for a union-find data structure.
349 
350  The function finds the root of idx by going iteratively through all
351  parent elements until a root is found. An element i is a root if
352  nodes[i] is zero. To make subsequent searches faster, the entry for
353  idx and all its parents is updated with the root element.
354  */
355 class union_find {
356 private:
358  t_index nextparent;
359 
360 public:
361  union_find(const t_index size) : parent(size > 0 ? 2 * size - 1 : 0, 0), nextparent(size) {}
362 
363  t_index Find(t_index idx) const
364  {
365  if (parent[idx] != 0) { // NOLINT a → b
366  t_index p = idx;
367  idx = parent[idx];
368  if (parent[idx] != 0) { // a → b → c
369  do {
370  idx = parent[idx];
371  } while (parent[idx] != 0);
372  do {
373  t_index tmp = parent[p];
374  parent[p] = idx;
375  p = tmp;
376  } while (parent[p] != idx);
377  }
378  }
379  return idx;
380  }
381 
382  void Union(const t_index node1, const t_index node2) { parent[node1] = parent[node2] = nextparent++; }
383 };
384 
385 class nan_error {
386 };
387 #ifdef FE_INVALID
388 class fenv_error {
389 };
390 #endif
391 
392 static void MST_linkage_core(const t_index N, const t_float *const D, cluster_result &Z2)
393 {
394  /*
395  N: integer, number of data points
396  D: condensed distance matrix N*(N-1)/2
397  Z2: output data structure
398 
399  The basis of this algorithm is an algorithm by Rohlf:
400 
401  F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
402  The Computer Journal, vol. 16, 1973, p. 93–95.
403  */
404  t_index i;
405  t_index idx2;
406  doubly_linked_list active_nodes(N);
408 
409  t_index prev_node;
410  t_float min;
411 
412  // first iteration
413  idx2 = 1;
414  min = std::numeric_limits<t_float>::infinity();
415  for (i = 1; i < N; ++i) {
416  d[i] = D[i - 1];
417 #if HAVE_DIAGNOSTIC
418 #pragma GCC diagnostic push
419 #pragma GCC diagnostic ignored "-Wfloat-equal"
420 #endif
421  if (d[i] < min) {
422  min = d[i];
423  idx2 = i;
424  } else if (fc_isnan(d[i]))
425  throw(nan_error());
426 #if HAVE_DIAGNOSTIC
427 #pragma GCC diagnostic pop
428 #endif
429  }
430  Z2.append(0, idx2, min);
431 
432  for (t_index j = 1; j < N - 1; ++j) {
433  prev_node = idx2;
434  active_nodes.remove(prev_node);
435 
436  idx2 = active_nodes.succ[0];
437  min = d[idx2];
438  for (i = idx2; i < prev_node; i = active_nodes.succ[i]) {
439  t_float tmp = D_(i, prev_node);
440 #if HAVE_DIAGNOSTIC
441 #pragma GCC diagnostic push
442 #pragma GCC diagnostic ignored "-Wfloat-equal"
443 #endif
444  if (tmp < d[i])
445  d[i] = tmp;
446  else if (fc_isnan(tmp))
447  throw(nan_error());
448 #if HAVE_DIAGNOSTIC
449 #pragma GCC diagnostic pop
450 #endif
451  if (d[i] < min) {
452  min = d[i];
453  idx2 = i;
454  }
455  }
456  for (; i < N; i = active_nodes.succ[i]) {
457  t_float tmp = D_(prev_node, i);
458 #if HAVE_DIAGNOSTIC
459 #pragma GCC diagnostic push
460 #pragma GCC diagnostic ignored "-Wfloat-equal"
461 #endif
462  if (d[i] > tmp)
463  d[i] = tmp;
464  else if (fc_isnan(tmp))
465  throw(nan_error());
466 #if HAVE_DIAGNOSTIC
467 #pragma GCC diagnostic pop
468 #endif
469  if (d[i] < min) {
470  min = d[i];
471  idx2 = i;
472  }
473  }
474  Z2.append(prev_node, idx2, min);
475  }
476 }
477 
478 /* Functions for the update of the dissimilarity array */
479 
480 inline static void f_single(t_float *const b, const t_float a)
481 {
482  if (*b > a)
483  *b = a;
484 }
485 inline static void f_complete(t_float *const b, const t_float a)
486 {
487  if (*b < a)
488  *b = a;
489 }
490 inline static void f_average(t_float *const b, const t_float a, const t_float s, const t_float t)
491 {
492  *b = s * a + t * (*b);
493 #ifndef FE_INVALID
494 #if HAVE_DIAGNOSTIC
495 #pragma GCC diagnostic push
496 #pragma GCC diagnostic ignored "-Wfloat-equal"
497 #endif
498  if (fc_isnan(*b)) {
499  throw(nan_error());
500  }
501 #if HAVE_DIAGNOSTIC
502 #pragma GCC diagnostic pop
503 #endif
504 #endif
505 }
506 inline static void f_weighted(t_float *const b, const t_float a)
507 {
508  *b = (a + *b) * .5;
509 #ifndef FE_INVALID
510 #if HAVE_DIAGNOSTIC
511 #pragma GCC diagnostic push
512 #pragma GCC diagnostic ignored "-Wfloat-equal"
513 #endif
514  if (fc_isnan(*b)) {
515  throw(nan_error());
516  }
517 #if HAVE_DIAGNOSTIC
518 #pragma GCC diagnostic pop
519 #endif
520 #endif
521 }
522 inline static void
523 f_ward(t_float *const b, const t_float a, const t_float c, const t_float s, const t_float t, const t_float v)
524 {
525  *b = ((v + s) * a - v * c + (v + t) * (*b)) / (s + t + v);
526 //*b = a+(*b)-(t*a+s*(*b)+v*c)/(s+t+v);
527 #ifndef FE_INVALID
528 #if HAVE_DIAGNOSTIC
529 #pragma GCC diagnostic push
530 #pragma GCC diagnostic ignored "-Wfloat-equal"
531 #endif
532  if (fc_isnan(*b)) {
533  throw(nan_error());
534  }
535 #if HAVE_DIAGNOSTIC
536 #pragma GCC diagnostic pop
537 #endif
538 #endif
539 }
540 inline static void f_centroid(t_float *const b, const t_float a, const t_float stc, const t_float s, const t_float t)
541 {
542  *b = s * a - stc + t * (*b);
543 #ifndef FE_INVALID
544  if (fc_isnan(*b)) {
545  throw(nan_error());
546  }
547 #if HAVE_DIAGNOSTIC
548 #pragma GCC diagnostic pop
549 #endif
550 #endif
551 }
552 inline static void f_median(t_float *const b, const t_float a, const t_float c_4)
553 {
554  *b = (a + (*b)) * .5 - c_4;
555 #ifndef FE_INVALID
556 #if HAVE_DIAGNOSTIC
557 #pragma GCC diagnostic push
558 #pragma GCC diagnostic ignored "-Wfloat-equal"
559 #endif
560  if (fc_isnan(*b)) {
561  throw(nan_error());
562  }
563 #if HAVE_DIAGNOSTIC
564 #pragma GCC diagnostic pop
565 #endif
566 #endif
567 }
568 
569 template <method_codes method, typename t_members>
570 static void NN_chain_core(const t_index N, t_float *const D, t_members *const members, cluster_result &Z2)
571 {
572  /*
573  N: integer
574  D: condensed distance matrix N*(N-1)/2
575  Z2: output data structure
576 
577  This is the NN-chain algorithm, described on page 86 in the following book:
578 
579  Fionn Murtagh, Multidimensional Clustering Algorithms,
580  Vienna, Würzburg: Physica-Verlag, 1985.
581  */
582  t_index i;
583 
584  auto_array_ptr<t_index> NN_chain(N);
585  t_index NN_chain_tip = 0;
586 
587  t_index idx1, idx2;
588 
589  t_float size1, size2;
590  doubly_linked_list active_nodes(N);
591 
592  t_float min;
593 
594  for (t_float const *DD = D; DD != D + (static_cast<std::ptrdiff_t>(N) * (N - 1) >> 1); ++DD) {
595 #if HAVE_DIAGNOSTIC
596 #pragma GCC diagnostic push
597 #pragma GCC diagnostic ignored "-Wfloat-equal"
598 #endif
599  if (fc_isnan(*DD)) {
600  throw(nan_error());
601  }
602 #if HAVE_DIAGNOSTIC
603 #pragma GCC diagnostic pop
604 #endif
605  }
606 
607 #ifdef FE_INVALID
608  if (feclearexcept(FE_INVALID))
609  throw fenv_error();
610 #endif
611 
612  for (t_index j = 0; j < N - 1; ++j) {
613  if (NN_chain_tip <= 3) {
614  NN_chain[0] = idx1 = active_nodes.start;
615  NN_chain_tip = 1;
616 
617  idx2 = active_nodes.succ[idx1];
618  min = D_(idx1, idx2);
619  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i]) {
620  if (D_(idx1, i) < min) {
621  min = D_(idx1, i);
622  idx2 = i;
623  }
624  }
625  } // a: idx1 b: idx2
626  else {
627  NN_chain_tip -= 3;
628  idx1 = NN_chain[NN_chain_tip - 1];
629  idx2 = NN_chain[NN_chain_tip];
630  min = idx1 < idx2 ? D_(idx1, idx2) : D_(idx2, idx1);
631  } // a: idx1 b: idx2
632 
633  do {
634  NN_chain[NN_chain_tip] = idx2;
635 
636  for (i = active_nodes.start; i < idx2; i = active_nodes.succ[i]) {
637  if (D_(i, idx2) < min) {
638  min = D_(i, idx2);
639  idx1 = i;
640  }
641  }
642  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i]) {
643  if (D_(idx2, i) < min) {
644  min = D_(idx2, i);
645  idx1 = i;
646  }
647  }
648 
649  idx2 = idx1;
650  idx1 = NN_chain[NN_chain_tip++];
651 
652  } while (idx2 != NN_chain[NN_chain_tip - 2]);
653 
654  Z2.append(idx1, idx2, min);
655 
656  if (idx1 > idx2) {
657  t_index tmp = idx1;
658  idx1 = idx2;
659  idx2 = tmp;
660  }
661 
662  if (method == METHOD_METR_AVERAGE || method == METHOD_METR_WARD) {
663  size1 = static_cast<t_float>(members[idx1]);
664  size2 = static_cast<t_float>(members[idx2]);
665  members[idx2] += members[idx1];
666  }
667 
668  // Remove the smaller index from the valid indices (active_nodes).
669  active_nodes.remove(idx1);
670 
671  switch (method) {
672  case METHOD_METR_SINGLE:
673  /*
674  Single linkage.
675 
676  Characteristic: new distances are never longer than the old distances.
677  */
678  // Update the distance matrix in the range [start, idx1).
679  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
680  f_single(&D_(i, idx2), D_(i, idx1));
681  // Update the distance matrix in the range (idx1, idx2).
682  for (; i < idx2; i = active_nodes.succ[i])
683  f_single(&D_(i, idx2), D_(idx1, i));
684  // Update the distance matrix in the range (idx2, N).
685  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
686  f_single(&D_(idx2, i), D_(idx1, i));
687  break;
688 
690  /*
691  Complete linkage.
692 
693  Characteristic: new distances are never shorter than the old distances.
694  */
695  // Update the distance matrix in the range [start, idx1).
696  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
697  f_complete(&D_(i, idx2), D_(i, idx1));
698  // Update the distance matrix in the range (idx1, idx2).
699  for (; i < idx2; i = active_nodes.succ[i])
700  f_complete(&D_(i, idx2), D_(idx1, i));
701  // Update the distance matrix in the range (idx2, N).
702  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
703  f_complete(&D_(idx2, i), D_(idx1, i));
704  break;
705 
706  case METHOD_METR_AVERAGE: {
707  /*
708  Average linkage.
709 
710  Shorter and longer distances can occur.
711  */
712  // Update the distance matrix in the range [start, idx1).
713  t_float s = size1 / (size1 + size2);
714  t_float t = size2 / (size1 + size2);
715  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
716  f_average(&D_(i, idx2), D_(i, idx1), s, t);
717  // Update the distance matrix in the range (idx1, idx2).
718  for (; i < idx2; i = active_nodes.succ[i])
719  f_average(&D_(i, idx2), D_(idx1, i), s, t);
720  // Update the distance matrix in the range (idx2, N).
721  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
722  f_average(&D_(idx2, i), D_(idx1, i), s, t);
723  break;
724  }
725 
727  /*
728  Weighted linkage.
729 
730  Shorter and longer distances can occur.
731  */
732  // Update the distance matrix in the range [start, idx1).
733  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
734  f_weighted(&D_(i, idx2), D_(i, idx1));
735  // Update the distance matrix in the range (idx1, idx2).
736  for (; i < idx2; i = active_nodes.succ[i])
737  f_weighted(&D_(i, idx2), D_(idx1, i));
738  // Update the distance matrix in the range (idx2, N).
739  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
740  f_weighted(&D_(idx2, i), D_(idx1, i));
741  break;
742 
743  case METHOD_METR_WARD:
744  /*
745  Ward linkage.
746 
747  Shorter and longer distances can occur, not smaller than min(d1,d2)
748  but maybe bigger than max(d1,d2).
749  */
750  // Update the distance matrix in the range [start, idx1).
751  // t_float v = static_cast<t_float>(members[i]);
752  for (i = active_nodes.start; i < idx1; i = active_nodes.succ[i])
753  f_ward(&D_(i, idx2), D_(i, idx1), min, size1, size2, static_cast<t_float>(members[i]));
754  // Update the distance matrix in the range (idx1, idx2).
755  for (; i < idx2; i = active_nodes.succ[i])
756  f_ward(&D_(i, idx2), D_(idx1, i), min, size1, size2, static_cast<t_float>(members[i]));
757  // Update the distance matrix in the range (idx2, N).
758  for (i = active_nodes.succ[idx2]; i < N; i = active_nodes.succ[i])
759  f_ward(&D_(idx2, i), D_(idx1, i), min, size1, size2, static_cast<t_float>(members[i]));
760  break;
761 
762  default: throw std::runtime_error(std::string("Invalid method."));
763  }
764  }
765 #ifdef FE_INVALID
766  if (fetestexcept(FE_INVALID))
767  throw fenv_error();
768 #endif
769 }
770 
771 class binary_min_heap {
772  /*
773  Class for a binary min-heap. The data resides in an array A. The elements of
774  A are not changed but two lists I and R of indices are generated which point
775  to elements of A and backwards.
776 
777  The heap tree structure is
778 
779  H[2*i+1] H[2*i+2]
780  \ /
781  \ /
782  ≤ ≤
783  \ /
784  \ /
785  H[i]
786 
787  where the children must be less or equal than their parent. Thus, H[0]
788  contains the minimum. The lists I and R are made such that H[i] = A[I[i]]
789  and R[I[i]] = i.
790 
791  This implementation is not designed to handle NaN values.
792  */
793 private:
794  t_float *const A;
795  t_index size;
798 
799  // no default constructor
800  binary_min_heap();
801  // noncopyable
803  binary_min_heap &operator=(binary_min_heap const &);
804 
805 public:
806  binary_min_heap(t_float *const A_, const t_index size_) : A(A_), size(size_), I(size), R(size)
807  { // Allocate memory and initialize the lists I and R to the identity. This
808  // does not make it a heap. Call heapify afterwards!
809  for (t_index i = 0; i < size; ++i)
810  R[i] = I[i] = i;
811  }
812 
813  binary_min_heap(t_float *const A_, const t_index size1, const t_index size2, const t_index start)
814  : A(A_), size(size1), I(size1), R(size2)
815  { // Allocate memory and initialize the lists I and R to the identity. This
816  // does not make it a heap. Call heapify afterwards!
817  for (t_index i = 0; i < size; ++i) {
818  R[i + start] = i;
819  I[i] = i + start;
820  }
821  }
822 
824 
825  void heapify()
826  {
827  // Arrange the indices I and R so that H[i] := A[I[i]] satisfies the heap
828  // condition H[i] < H[2*i+1] and H[i] < H[2*i+2] for each i.
829  //
830  // Complexity: Θ(size)
831  // Reference: Cormen, Leiserson, Rivest, Stein, Introduction to Algorithms,
832  // 3rd ed., 2009, Section 6.3 “Building a heap”
833  t_index idx;
834  for (idx = (size >> 1); idx > 0;) {
835  --idx;
836  update_geq_(idx);
837  }
838  }
839 
840  inline t_index argmin() const
841  {
842  // Return the minimal element.
843  return I[0];
844  }
845 
846  void heap_pop()
847  {
848  // Remove the minimal element from the heap.
849  --size;
850  I[0] = I[size];
851  R[I[0]] = 0;
852  update_geq_(0);
853  }
854 
855  void remove(t_index idx)
856  {
857  // Remove an element from the heap.
858  --size;
859  R[I[size]] = R[idx];
860  I[R[idx]] = I[size];
861  if (H(size) <= A[idx]) {
862  update_leq_(R[idx]);
863  } else {
864  update_geq_(R[idx]);
865  }
866  }
867 
868  void replace(const t_index idxold, const t_index idxnew, const t_float val)
869  {
870  R[idxnew] = R[idxold];
871  I[R[idxnew]] = idxnew;
872  if (val <= A[idxold])
873  update_leq(idxnew, val);
874  else
875  update_geq(idxnew, val);
876  }
877 
878  void update(const t_index idx, const t_float val) const
879  {
880  // Update the element A[i] with val and re-arrange the indices to preserve
881  // the heap condition.
882  if (val <= A[idx])
883  update_leq(idx, val);
884  else
885  update_geq(idx, val);
886  }
887 
888  void update_leq(const t_index idx, const t_float val) const
889  {
890  // Use this when the new value is not more than the old value.
891  A[idx] = val;
892  update_leq_(R[idx]);
893  }
894 
895  void update_geq(const t_index idx, const t_float val) const
896  {
897  // Use this when the new value is not less than the old value.
898  A[idx] = val;
899  update_geq_(R[idx]);
900  }
901 
902 private:
903  void update_leq_(t_index i) const
904  {
905  t_index j;
906  for (; (i > 0) && (H(i) < H(j = (i - 1) >> 1)); i = j)
907  heap_swap(i, j);
908  }
909 
910  void update_geq_(t_index i) const
911  {
912  t_index j;
913  for (; (j = 2 * i + 1) < size; i = j) {
914  if (H(j) >= H(i)) {
915  ++j;
916  if (j >= size || H(j) >= H(i))
917  break;
918  } else if (j + 1 < size && H(j + 1) < H(j))
919  ++j;
920  heap_swap(i, j);
921  }
922  }
923 
924  void heap_swap(const t_index i, const t_index j) const
925  {
926  // Swap two indices.
927  t_index tmp = I[i];
928  I[i] = I[j];
929  I[j] = tmp;
930  R[I[i]] = i;
931  R[I[j]] = j;
932  }
933 
934  inline t_float H(const t_index i) const { return A[I[i]]; }
935 };
936 
937 template <method_codes method, typename t_members>
938 static void generic_linkage(const t_index N, t_float *const D, t_members *const members, cluster_result &Z2)
939 {
940  /*
941  N: integer, number of data points
942  D: condensed distance matrix N*(N-1)/2
943  Z2: output data structure
944  */
945 
946  const t_index N_1 = N - 1;
947  t_index i, j; // loop variables
948  t_index idx1, idx2; // row and column indices
949 
950  auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
951  auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
952  auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
953  // i-th row represents
954  doubly_linked_list active_nodes(N);
955  binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
956  // the distance to the nearest neighbor of each point
957  t_index node1, node2; // node numbers in the output
958  t_float size1, size2; // and their cardinalities
959 
960  t_float min; // minimum and row index for nearest-neighbor search
961  t_index idx;
962 
963  for (i = 0; i < N; ++i)
964  // Build a list of row ↔ node label assignments.
965  // Initially i ↦ i
966  row_repr[i] = i;
967 
968  // Initialize the minimal distances:
969  // Find the nearest neighbor of each point.
970  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
971  t_float const *DD = D;
972  for (i = 0; i < N_1; ++i) {
973  min = std::numeric_limits<t_float>::infinity();
974  for (idx = j = i + 1; j < N; ++j, ++DD) {
975 #if HAVE_DIAGNOSTIC
976 #pragma GCC diagnostic push
977 #pragma GCC diagnostic ignored "-Wfloat-equal"
978 #endif
979  if (*DD < min) {
980  min = *DD;
981  idx = j;
982  } else if (fc_isnan(*DD))
983  throw(nan_error());
984  }
985 #if HAVE_DIAGNOSTIC
986 #pragma GCC diagnostic pop
987 #endif
988  mindist[i] = min;
989  n_nghbr[i] = idx;
990  }
991 
992  // Put the minimal distances into a heap structure to make the repeated
993  // global minimum searches fast.
994  nn_distances.heapify();
995 
996 #ifdef FE_INVALID
997  if (feclearexcept(FE_INVALID))
998  throw fenv_error();
999 #endif
1000 
1001  // Main loop: We have N-1 merging steps.
1002  for (i = 0; i < N_1; ++i) {
1003  /*
1004  Here is a special feature that allows fast bookkeeping and updates of the
1005  minimal distances.
1006 
1007  mindist[i] stores a lower bound on the minimum distance of the point i to
1008  all points of higher index:
1009 
1010  mindist[i] ≥ min_{j>i} D(i,j)
1011 
1012  Normally, we have equality. However, this minimum may become invalid due
1013  to the updates in the distance matrix. The rules are:
1014 
1015  1) If mindist[i] is equal to D(i, n_nghbr[i]), this is the correct
1016  minimum and n_nghbr[i] is a nearest neighbor.
1017 
1018  2) If mindist[i] is smaller than D(i, n_nghbr[i]), this might not be the
1019  correct minimum. The minimum needs to be recomputed.
1020 
1021  3) mindist[i] is never bigger than the true minimum. Hence, we never
1022  miss the true minimum if we take the smallest mindist entry,
1023  re-compute the value if necessary (thus maybe increasing it) and
1024  looking for the now smallest mindist entry until a valid minimal
1025  entry is found. This step is done in the lines below.
1026 
1027  The update process for D below takes care that these rules are
1028  fulfilled. This makes sure that the minima in the rows D(i,i+1:)of D are
1029  re-calculated when necessary but re-calculation is avoided whenever
1030  possible.
1031 
1032  The re-calculation of the minima makes the worst-case runtime of this
1033  algorithm cubic in N. We avoid this whenever possible, and in most cases
1034  the runtime appears to be quadratic.
1035  */
1036  idx1 = nn_distances.argmin();
1037  if (method != METHOD_METR_SINGLE) {
1038  while (mindist[idx1] < D_(idx1, n_nghbr[idx1])) {
1039  // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1040  n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1041  min = D_(idx1, j);
1042  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1043  if (D_(idx1, j) < min) {
1044  min = D_(idx1, j);
1045  n_nghbr[idx1] = j;
1046  }
1047  }
1048  /* Update the heap with the new true minimum and search for the
1049  (possibly different) minimal entry. */
1050  nn_distances.update_geq(idx1, min);
1051  idx1 = nn_distances.argmin();
1052  }
1053  }
1054 
1055  nn_distances.heap_pop(); // Remove the current minimum from the heap.
1056  idx2 = n_nghbr[idx1];
1057 
1058  // Write the newly found minimal pair of nodes to the output array.
1059  node1 = row_repr[idx1];
1060  node2 = row_repr[idx2];
1061 
1062  if (method == METHOD_METR_AVERAGE || method == METHOD_METR_WARD || method == METHOD_METR_CENTROID) {
1063  size1 = static_cast<t_float>(members[idx1]);
1064  size2 = static_cast<t_float>(members[idx2]);
1065  members[idx2] += members[idx1];
1066  }
1067  Z2.append(node1, node2, mindist[idx1]);
1068 
1069  // Remove idx1 from the list of active indices (active_nodes).
1070  active_nodes.remove(idx1);
1071  // Index idx2 now represents the new (merged) node with label N+i.
1072  row_repr[idx2] = N + i;
1073 
1074  // Update the distance matrix
1075  switch (method) {
1076  case METHOD_METR_SINGLE:
1077  /*
1078  Single linkage.
1079 
1080  Characteristic: new distances are never longer than the old distances.
1081  */
1082  // Update the distance matrix in the range [start, idx1).
1083  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1084  f_single(&D_(j, idx2), D_(j, idx1));
1085  if (n_nghbr[j] == idx1)
1086  n_nghbr[j] = idx2;
1087  }
1088  // Update the distance matrix in the range (idx1, idx2).
1089  for (; j < idx2; j = active_nodes.succ[j]) {
1090  f_single(&D_(j, idx2), D_(idx1, j));
1091  // If the new value is below the old minimum in a row, update
1092  // the mindist and n_nghbr arrays.
1093  if (D_(j, idx2) < mindist[j]) {
1094  nn_distances.update_leq(j, D_(j, idx2));
1095  n_nghbr[j] = idx2;
1096  }
1097  }
1098  // Update the distance matrix in the range (idx2, N).
1099  // Recompute the minimum mindist[idx2] and n_nghbr[idx2].
1100  if (idx2 < N_1) {
1101  min = mindist[idx2];
1102  for (j = active_nodes.succ[idx2]; j < N; j = active_nodes.succ[j]) {
1103  f_single(&D_(idx2, j), D_(idx1, j));
1104  if (D_(idx2, j) < min) {
1105  n_nghbr[idx2] = j;
1106  min = D_(idx2, j);
1107  }
1108  }
1109  nn_distances.update_leq(idx2, min);
1110  }
1111  break;
1112 
1113  case METHOD_METR_COMPLETE:
1114  /*
1115  Complete linkage.
1116 
1117  Characteristic: new distances are never shorter than the old distances.
1118  */
1119  // Update the distance matrix in the range [start, idx1).
1120  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1121  f_complete(&D_(j, idx2), D_(j, idx1));
1122  if (n_nghbr[j] == idx1)
1123  n_nghbr[j] = idx2;
1124  }
1125  // Update the distance matrix in the range (idx1, idx2).
1126  for (; j < idx2; j = active_nodes.succ[j])
1127  f_complete(&D_(j, idx2), D_(idx1, j));
1128  // Update the distance matrix in the range (idx2, N).
1129  for (j = active_nodes.succ[idx2]; j < N; j = active_nodes.succ[j])
1130  f_complete(&D_(idx2, j), D_(idx1, j));
1131  break;
1132 
1133  case METHOD_METR_AVERAGE: {
1134  /*
1135  Average linkage.
1136 
1137  Shorter and longer distances can occur.
1138  */
1139  // Update the distance matrix in the range [start, idx1).
1140  t_float s = size1 / (size1 + size2);
1141  t_float t = size2 / (size1 + size2);
1142  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1143  f_average(&D_(j, idx2), D_(j, idx1), s, t);
1144  if (n_nghbr[j] == idx1)
1145  n_nghbr[j] = idx2;
1146  }
1147  // Update the distance matrix in the range (idx1, idx2).
1148  for (; j < idx2; j = active_nodes.succ[j]) {
1149  f_average(&D_(j, idx2), D_(idx1, j), s, t);
1150  if (D_(j, idx2) < mindist[j]) {
1151  nn_distances.update_leq(j, D_(j, idx2));
1152  n_nghbr[j] = idx2;
1153  }
1154  }
1155  // Update the distance matrix in the range (idx2, N).
1156  if (idx2 < N_1) {
1157  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1158  f_average(&D_(idx2, j), D_(idx1, j), s, t);
1159  min = D_(idx2, j);
1160  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1161  f_average(&D_(idx2, j), D_(idx1, j), s, t);
1162  if (D_(idx2, j) < min) {
1163  min = D_(idx2, j);
1164  n_nghbr[idx2] = j;
1165  }
1166  }
1167  nn_distances.update(idx2, min);
1168  }
1169  break;
1170  }
1171 
1172  case METHOD_METR_WEIGHTED:
1173  /*
1174  Weighted linkage.
1175 
1176  Shorter and longer distances can occur.
1177  */
1178  // Update the distance matrix in the range [start, idx1).
1179  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1180  f_weighted(&D_(j, idx2), D_(j, idx1));
1181  if (n_nghbr[j] == idx1)
1182  n_nghbr[j] = idx2;
1183  }
1184  // Update the distance matrix in the range (idx1, idx2).
1185  for (; j < idx2; j = active_nodes.succ[j]) {
1186  f_weighted(&D_(j, idx2), D_(idx1, j));
1187  if (D_(j, idx2) < mindist[j]) {
1188  nn_distances.update_leq(j, D_(j, idx2));
1189  n_nghbr[j] = idx2;
1190  }
1191  }
1192  // Update the distance matrix in the range (idx2, N).
1193  if (idx2 < N_1) {
1194  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1195  f_weighted(&D_(idx2, j), D_(idx1, j));
1196  min = D_(idx2, j);
1197  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1198  f_weighted(&D_(idx2, j), D_(idx1, j));
1199  if (D_(idx2, j) < min) {
1200  min = D_(idx2, j);
1201  n_nghbr[idx2] = j;
1202  }
1203  }
1204  nn_distances.update(idx2, min);
1205  }
1206  break;
1207 
1208  case METHOD_METR_WARD:
1209  /*
1210  Ward linkage.
1211 
1212  Shorter and longer distances can occur, not smaller than min(d1,d2)
1213  but maybe bigger than max(d1,d2).
1214  */
1215  // Update the distance matrix in the range [start, idx1).
1216  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1217  f_ward(&D_(j, idx2), D_(j, idx1), mindist[idx1], size1, size2, static_cast<t_float>(members[j]));
1218  if (n_nghbr[j] == idx1)
1219  n_nghbr[j] = idx2;
1220  }
1221  // Update the distance matrix in the range (idx1, idx2).
1222  for (; j < idx2; j = active_nodes.succ[j]) {
1223  f_ward(&D_(j, idx2), D_(idx1, j), mindist[idx1], size1, size2, static_cast<t_float>(members[j]));
1224  if (D_(j, idx2) < mindist[j]) {
1225  nn_distances.update_leq(j, D_(j, idx2));
1226  n_nghbr[j] = idx2;
1227  }
1228  }
1229  // Update the distance matrix in the range (idx2, N).
1230  if (idx2 < N_1) {
1231  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1232  f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1], size1, size2, static_cast<t_float>(members[j]));
1233  min = D_(idx2, j);
1234  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1235  f_ward(&D_(idx2, j), D_(idx1, j), mindist[idx1], size1, size2, static_cast<t_float>(members[j]));
1236  if (D_(idx2, j) < min) {
1237  min = D_(idx2, j);
1238  n_nghbr[idx2] = j;
1239  }
1240  }
1241  nn_distances.update(idx2, min);
1242  }
1243  break;
1244 
1245  case METHOD_METR_CENTROID: {
1246  /*
1247  Centroid linkage.
1248 
1249  Shorter and longer distances can occur, not bigger than max(d1,d2)
1250  but maybe smaller than min(d1,d2).
1251  */
1252  // Update the distance matrix in the range [start, idx1).
1253  t_float s = size1 / (size1 + size2);
1254  t_float t = size2 / (size1 + size2);
1255  t_float stc = s * t * mindist[idx1];
1256  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1257  f_centroid(&D_(j, idx2), D_(j, idx1), stc, s, t);
1258  if (D_(j, idx2) < mindist[j]) {
1259  nn_distances.update_leq(j, D_(j, idx2));
1260  n_nghbr[j] = idx2;
1261  } else if (n_nghbr[j] == idx1)
1262  n_nghbr[j] = idx2;
1263  }
1264  // Update the distance matrix in the range (idx1, idx2).
1265  for (; j < idx2; j = active_nodes.succ[j]) {
1266  f_centroid(&D_(j, idx2), D_(idx1, j), stc, s, t);
1267  if (D_(j, idx2) < mindist[j]) {
1268  nn_distances.update_leq(j, D_(j, idx2));
1269  n_nghbr[j] = idx2;
1270  }
1271  }
1272  // Update the distance matrix in the range (idx2, N).
1273  if (idx2 < N_1) {
1274  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1275  f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1276  min = D_(idx2, j);
1277  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1278  f_centroid(&D_(idx2, j), D_(idx1, j), stc, s, t);
1279  if (D_(idx2, j) < min) {
1280  min = D_(idx2, j);
1281  n_nghbr[idx2] = j;
1282  }
1283  }
1284  nn_distances.update(idx2, min);
1285  }
1286  break;
1287  }
1288 
1289  case METHOD_METR_MEDIAN: {
1290  /*
1291  Median linkage.
1292 
1293  Shorter and longer distances can occur, not bigger than max(d1,d2)
1294  but maybe smaller than min(d1,d2).
1295  */
1296  // Update the distance matrix in the range [start, idx1).
1297  t_float c_4 = mindist[idx1] * .25;
1298  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1299  f_median(&D_(j, idx2), D_(j, idx1), c_4);
1300  if (D_(j, idx2) < mindist[j]) {
1301  nn_distances.update_leq(j, D_(j, idx2));
1302  n_nghbr[j] = idx2;
1303  } else if (n_nghbr[j] == idx1)
1304  n_nghbr[j] = idx2;
1305  }
1306  // Update the distance matrix in the range (idx1, idx2).
1307  for (; j < idx2; j = active_nodes.succ[j]) {
1308  f_median(&D_(j, idx2), D_(idx1, j), c_4);
1309  if (D_(j, idx2) < mindist[j]) {
1310  nn_distances.update_leq(j, D_(j, idx2));
1311  n_nghbr[j] = idx2;
1312  }
1313  }
1314  // Update the distance matrix in the range (idx2, N).
1315  if (idx2 < N_1) {
1316  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1317  f_median(&D_(idx2, j), D_(idx1, j), c_4);
1318  min = D_(idx2, j);
1319  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1320  f_median(&D_(idx2, j), D_(idx1, j), c_4);
1321  if (D_(idx2, j) < min) {
1322  min = D_(idx2, j);
1323  n_nghbr[idx2] = j;
1324  }
1325  }
1326  nn_distances.update(idx2, min);
1327  }
1328  break;
1329  }
1330 
1331  default: throw std::runtime_error(std::string("Invalid method."));
1332  }
1333  }
1334 #ifdef FE_INVALID
1335  if (fetestexcept(FE_INVALID))
1336  throw fenv_error();
1337 #endif
1338 }
1339 
1340 /*
1341  Clustering methods for vector data
1342 */
1343 
1344 template <typename t_dissimilarity>
1345 static void MST_linkage_core_vector(const t_index N, t_dissimilarity &dist, cluster_result &Z2)
1346 {
1347  /*
1348  N: integer, number of data points
1349  dist: function pointer to the metric
1350  Z2: output data structure
1351 
1352  The basis of this algorithm is an algorithm by Rohlf:
1353 
1354  F. James Rohlf, Hierarchical clustering using the minimum spanning tree,
1355  The Computer Journal, vol. 16, 1973, p. 93–95.
1356  */
1357  t_index i;
1358  t_index idx2;
1359  doubly_linked_list active_nodes(N);
1361 
1362  t_index prev_node;
1363  t_float min;
1364 
1365  // first iteration
1366  idx2 = 1;
1367  min = std::numeric_limits<t_float>::infinity();
1368  for (i = 1; i < N; ++i) {
1369  d[i] = dist(0, i);
1370 #if HAVE_DIAGNOSTIC
1371 #pragma GCC diagnostic push
1372 #pragma GCC diagnostic ignored "-Wfloat-equal"
1373 #endif
1374  if (d[i] < min) {
1375  min = d[i];
1376  idx2 = i;
1377  } else if (fc_isnan(d[i]))
1378  throw(nan_error());
1379 #if HAVE_DIAGNOSTIC
1380 #pragma GCC diagnostic pop
1381 #endif
1382  }
1383 
1384  Z2.append(0, idx2, min);
1385 
1386  for (t_index j = 1; j < N - 1; ++j) {
1387  prev_node = idx2;
1388  active_nodes.remove(prev_node);
1389 
1390  idx2 = active_nodes.succ[0];
1391  min = d[idx2];
1392 
1393  for (i = idx2; i < N; i = active_nodes.succ[i]) {
1394  t_float tmp = dist(i, prev_node);
1395 #if HAVE_DIAGNOSTIC
1396 #pragma GCC diagnostic push
1397 #pragma GCC diagnostic ignored "-Wfloat-equal"
1398 #endif
1399  if (d[i] > tmp)
1400  d[i] = tmp;
1401  else if (fc_isnan(tmp))
1402  throw(nan_error());
1403 #if HAVE_DIAGNOSTIC
1404 #pragma GCC diagnostic pop
1405 #endif
1406  if (d[i] < min) {
1407  min = d[i];
1408  idx2 = i;
1409  }
1410  }
1411  Z2.append(prev_node, idx2, min);
1412  }
1413 }
1414 
1415 template <method_codes_vector method, typename t_dissimilarity>
1416 static void generic_linkage_vector(const t_index N, t_dissimilarity &dist, cluster_result &Z2)
1417 {
1418  /*
1419  N: integer, number of data points
1420  dist: function pointer to the metric
1421  Z2: output data structure
1422 
1423  This algorithm is valid for the distance update methods
1424  "Ward", "centroid" and "median" only!
1425  */
1426  const t_index N_1 = N - 1;
1427  t_index i, j; // loop variables
1428  t_index idx1, idx2; // row and column indices
1429 
1430  auto_array_ptr<t_index> n_nghbr(N_1); // array of nearest neighbors
1431  auto_array_ptr<t_float> mindist(N_1); // distances to the nearest neighbors
1432  auto_array_ptr<t_index> row_repr(N); // row_repr[i]: node number that the
1433  // i-th row represents
1434  doubly_linked_list active_nodes(N);
1435  binary_min_heap nn_distances(&*mindist, N_1); // minimum heap structure for
1436  // the distance to the nearest neighbor of each point
1437  t_index node1, node2; // node numbers in the output
1438  t_float min; // minimum and row index for nearest-neighbor search
1439 
1440  for (i = 0; i < N; ++i)
1441  // Build a list of row ↔ node label assignments.
1442  // Initially i ↦ i
1443  row_repr[i] = i;
1444 
1445  // Initialize the minimal distances:
1446  // Find the nearest neighbor of each point.
1447  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1448  for (i = 0; i < N_1; ++i) {
1449  min = std::numeric_limits<t_float>::infinity();
1450  t_index idx;
1451  for (idx = j = i + 1; j < N; ++j) {
1452  t_float tmp;
1453  switch (method) {
1454  case METHOD_VECTOR_WARD: tmp = dist.ward_initial(i, j); break;
1455  default: tmp = dist.template sqeuclidean<true>(i, j);
1456  }
1457  if (tmp < min) {
1458  min = tmp;
1459  idx = j;
1460  }
1461  }
1462  switch (method) {
1463  case METHOD_VECTOR_WARD: mindist[i] = t_dissimilarity::ward_initial_conversion(min); break;
1464  default: mindist[i] = min;
1465  }
1466  n_nghbr[i] = idx;
1467  }
1468 
1469  // Put the minimal distances into a heap structure to make the repeated
1470  // global minimum searches fast.
1471  nn_distances.heapify();
1472 
1473  // Main loop: We have N-1 merging steps.
1474  for (i = 0; i < N_1; ++i) {
1475  idx1 = nn_distances.argmin();
1476 
1477  while (active_nodes.is_inactive(n_nghbr[idx1])) {
1478  // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1479  n_nghbr[idx1] = j = active_nodes.succ[idx1]; // exists, maximally N-1
1480  switch (method) {
1481  case METHOD_VECTOR_WARD:
1482  min = dist.ward(idx1, j);
1483  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1484  t_float const tmp = dist.ward(idx1, j);
1485  if (tmp < min) {
1486  min = tmp;
1487  n_nghbr[idx1] = j;
1488  }
1489  }
1490  break;
1491  default:
1492  min = dist.template sqeuclidean<true>(idx1, j);
1493  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1494  t_float const tmp = dist.template sqeuclidean<true>(idx1, j);
1495  if (tmp < min) {
1496  min = tmp;
1497  n_nghbr[idx1] = j;
1498  }
1499  }
1500  }
1501  /* Update the heap with the new true minimum and search for the (possibly
1502  different) minimal entry. */
1503  nn_distances.update_geq(idx1, min);
1504  idx1 = nn_distances.argmin();
1505  }
1506 
1507  nn_distances.heap_pop(); // Remove the current minimum from the heap.
1508  idx2 = n_nghbr[idx1];
1509 
1510  // Write the newly found minimal pair of nodes to the output array.
1511  node1 = row_repr[idx1];
1512  node2 = row_repr[idx2];
1513 
1514  Z2.append(node1, node2, mindist[idx1]);
1515 
1516  switch (method) {
1517  case METHOD_VECTOR_WARD:
1518  case METHOD_VECTOR_CENTROID: dist.merge_inplace(idx1, idx2); break;
1519  case METHOD_VECTOR_MEDIAN: dist.merge_inplace_weighted(idx1, idx2); break;
1520  default: throw std::runtime_error(std::string("Invalid method."));
1521  }
1522 
1523  // Index idx2 now represents the new (merged) node with label N+i.
1524  row_repr[idx2] = N + i;
1525  // Remove idx1 from the list of active indices (active_nodes).
1526  active_nodes.remove(idx1); // TBD later!!!
1527 
1528  // Update the distance matrix
1529  switch (method) {
1530  case METHOD_VECTOR_WARD:
1531  /*
1532  Ward linkage.
1533 
1534  Shorter and longer distances can occur, not smaller than min(d1,d2)
1535  but maybe bigger than max(d1,d2).
1536  */
1537  // Update the distance matrix in the range [start, idx1).
1538  for (j = active_nodes.start; j < idx1; j = active_nodes.succ[j]) {
1539  if (n_nghbr[j] == idx2) {
1540  n_nghbr[j] = idx1; // invalidate
1541  }
1542  }
1543  // Update the distance matrix in the range (idx1, idx2).
1544  for (; j < idx2; j = active_nodes.succ[j]) {
1545  t_float const tmp = dist.ward(j, idx2);
1546  if (tmp < mindist[j]) {
1547  nn_distances.update_leq(j, tmp);
1548  n_nghbr[j] = idx2;
1549  } else if (n_nghbr[j] == idx2) {
1550  n_nghbr[j] = idx1; // invalidate
1551  }
1552  }
1553  // Find the nearest neighbor for idx2.
1554  if (idx2 < N_1) {
1555  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1556  min = dist.ward(idx2, j);
1557  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1558  t_float const tmp = dist.ward(idx2, j);
1559  if (tmp < min) {
1560  min = tmp;
1561  n_nghbr[idx2] = j;
1562  }
1563  }
1564  nn_distances.update(idx2, min);
1565  }
1566  break;
1567 
1568  default:
1569  /*
1570  Centroid and median linkage.
1571 
1572  Shorter and longer distances can occur, not bigger than max(d1,d2)
1573  but maybe smaller than min(d1,d2).
1574  */
1575  for (j = active_nodes.start; j < idx2; j = active_nodes.succ[j]) {
1576  t_float const tmp = dist.template sqeuclidean<true>(j, idx2);
1577  if (tmp < mindist[j]) {
1578  nn_distances.update_leq(j, tmp);
1579  n_nghbr[j] = idx2;
1580  } else if (n_nghbr[j] == idx2)
1581  n_nghbr[j] = idx1; // invalidate
1582  }
1583  // Find the nearest neighbor for idx2.
1584  if (idx2 < N_1) {
1585  n_nghbr[idx2] = j = active_nodes.succ[idx2]; // exists, maximally N-1
1586  min = dist.template sqeuclidean<true>(idx2, j);
1587  for (j = active_nodes.succ[j]; j < N; j = active_nodes.succ[j]) {
1588  t_float const tmp = dist.template sqeuclidean<true>(idx2, j);
1589  if (tmp < min) {
1590  min = tmp;
1591  n_nghbr[idx2] = j;
1592  }
1593  }
1594  nn_distances.update(idx2, min);
1595  }
1596  }
1597  }
1598 }
1599 
1600 template <method_codes_vector method, typename t_dissimilarity>
1601 static void generic_linkage_vector_alternative(const t_index N, t_dissimilarity &dist, cluster_result &Z2)
1602 {
1603  /*
1604  N: integer, number of data points
1605  dist: function pointer to the metric
1606  Z2: output data structure
1607 
1608  This algorithm is valid for the distance update methods
1609  "Ward", "centroid" and "median" only!
1610  */
1611  const t_index N_1 = N - 1;
1612  t_index i, j = 0; // loop variables
1613  t_index idx1, idx2; // row and column indices
1614 
1615  auto_array_ptr<t_index> n_nghbr(2 * N - 2); // array of nearest neighbors
1616  auto_array_ptr<t_float> mindist(2 * N - 2); // distances to the nearest neighbors
1617 
1618  doubly_linked_list active_nodes(N + N_1);
1619  binary_min_heap nn_distances(&*mindist, N_1, 2 * N - 2,
1620  1); // minimum heap
1621  // structure for the distance to the nearest neighbor of each point
1622 
1623  t_float min; // minimum for nearest-neighbor searches
1624 
1625  // Initialize the minimal distances:
1626  // Find the nearest neighbor of each point.
1627  // n_nghbr[i] = argmin_{j>i} D(i,j) for i in range(N-1)
1628  for (i = 1; i < N; ++i) {
1629  min = std::numeric_limits<t_float>::infinity();
1630  t_index idx;
1631  for (idx = j = 0; j < i; ++j) {
1632  t_float tmp;
1633  switch (method) {
1634  case METHOD_VECTOR_WARD: tmp = dist.ward_initial(i, j); break;
1635  default: tmp = dist.template sqeuclidean<true>(i, j);
1636  }
1637  if (tmp < min) {
1638  min = tmp;
1639  idx = j;
1640  }
1641  }
1642  switch (method) {
1643  case METHOD_VECTOR_WARD: mindist[i] = t_dissimilarity::ward_initial_conversion(min); break;
1644  default: mindist[i] = min;
1645  }
1646  n_nghbr[i] = idx;
1647  }
1648 
1649  // Put the minimal distances into a heap structure to make the repeated
1650  // global minimum searches fast.
1651  nn_distances.heapify();
1652 
1653  // Main loop: We have N-1 merging steps.
1654  for (i = N; i < N + N_1; ++i) {
1655  /*
1656  The bookkeeping is different from the "stored matrix approach" algorithm
1657  generic_linkage.
1658 
1659  mindist[i] stores a lower bound on the minimum distance of the point i to
1660  all points of *lower* index:
1661 
1662  mindist[i] ≥ min_{j<i} D(i,j)
1663 
1664  Moreover, new nodes do not re-use one of the old indices, but they are
1665  given a new, unique index (SciPy convention: initial nodes are 0,…,N−1,
1666  new nodes are N,…,2N−2).
1667 
1668  Invalid nearest neighbors are not recognized by the fact that the stored
1669  distance is smaller than the actual distance, but the list active_nodes
1670  maintains a flag whether a node is inactive. If n_nghbr[i] points to an
1671  active node, the entries nn_distances[i] and n_nghbr[i] are valid,
1672  otherwise they must be recomputed.
1673  */
1674  idx1 = nn_distances.argmin();
1675  while (active_nodes.is_inactive(n_nghbr[idx1])) {
1676  // Recompute the minimum mindist[idx1] and n_nghbr[idx1].
1677  n_nghbr[idx1] = j = active_nodes.start;
1678  switch (method) {
1679  case METHOD_VECTOR_WARD:
1680  min = dist.ward_extended(idx1, j);
1681  for (j = active_nodes.succ[j]; j < idx1; j = active_nodes.succ[j]) {
1682  t_float tmp = dist.ward_extended(idx1, j);
1683  if (tmp < min) {
1684  min = tmp;
1685  n_nghbr[idx1] = j;
1686  }
1687  }
1688  break;
1689  default:
1690  min = dist.sqeuclidean_extended(idx1, j);
1691  for (j = active_nodes.succ[j]; j < idx1; j = active_nodes.succ[j]) {
1692  t_float const tmp = dist.sqeuclidean_extended(idx1, j);
1693  if (tmp < min) {
1694  min = tmp;
1695  n_nghbr[idx1] = j;
1696  }
1697  }
1698  }
1699  /* Update the heap with the new true minimum and search for the (possibly
1700  different) minimal entry. */
1701  nn_distances.update_geq(idx1, min);
1702  idx1 = nn_distances.argmin();
1703  }
1704 
1705  idx2 = n_nghbr[idx1];
1706  active_nodes.remove(idx1);
1707  active_nodes.remove(idx2);
1708 
1709  Z2.append(idx1, idx2, mindist[idx1]);
1710 
1711  if (i < 2 * N_1) {
1712  switch (method) {
1713  case METHOD_VECTOR_WARD:
1714  case METHOD_VECTOR_CENTROID: dist.merge(idx1, idx2, i); break;
1715 
1716  case METHOD_VECTOR_MEDIAN: dist.merge_weighted(idx1, idx2, i); break;
1717 
1718  default: throw std::runtime_error(std::string("Invalid method."));
1719  }
1720 
1721  n_nghbr[i] = active_nodes.start;
1722  if (method == METHOD_VECTOR_WARD) {
1723  /*
1724  Ward linkage.
1725 
1726  Shorter and longer distances can occur, not smaller than min(d1,d2)
1727  but maybe bigger than max(d1,d2).
1728  */
1729  min = dist.ward_extended(active_nodes.start, i);
1730  for (j = active_nodes.succ[active_nodes.start]; j < i; j = active_nodes.succ[j]) {
1731  t_float tmp = dist.ward_extended(j, i);
1732  if (tmp < min) {
1733  min = tmp;
1734  n_nghbr[i] = j;
1735  }
1736  }
1737  } else {
1738  /*
1739  Centroid and median linkage.
1740 
1741  Shorter and longer distances can occur, not bigger than max(d1,d2)
1742  but maybe smaller than min(d1,d2).
1743  */
1744  min = dist.sqeuclidean_extended(active_nodes.start, i);
1745  for (j = active_nodes.succ[active_nodes.start]; j < i; j = active_nodes.succ[j]) {
1746  t_float tmp = dist.sqeuclidean_extended(j, i);
1747  if (tmp < min) {
1748  min = tmp;
1749  n_nghbr[i] = j;
1750  }
1751  }
1752  }
1753  if (idx2 < active_nodes.start) {
1754  nn_distances.remove(active_nodes.start);
1755  } else {
1756  nn_distances.remove(idx2);
1757  }
1758  nn_distances.replace(idx1, i, min);
1759  }
1760  }
1761 }
1762 
1763 #if HAVE_VISIBILITY
1764 #pragma GCC visibility pop
1765 #endif
auto_array_ptr
Definition: fastcluster_dm.cxx:177
METHOD_VECTOR_SINGLE
@ METHOD_VECTOR_SINGLE
Definition: fastcluster_dm.cxx:166
node::dist
t_float dist
Definition: fastcluster_dm.cxx:215
MIN_METHOD_VECTOR_CODE
@ MIN_METHOD_VECTOR_CODE
Definition: fastcluster_dm.cxx:171
auto_array_ptr::init
void init(index const size)
Definition: fastcluster_dm.cxx:200
union_find::Find
t_index Find(t_index idx) const
Definition: fastcluster_dm.cxx:363
cluster_result::append
void append(const t_index node1, const t_index node2, const t_float dist)
Definition: fastcluster_dm.cxx:231
cluster_result::operator[]
node * operator[](const t_index idx) const
Definition: fastcluster_dm.cxx:239
binary_min_heap::update_leq
void update_leq(const t_index idx, const t_float val) const
Definition: fastcluster_dm.cxx:889
cluster_result::power
void power(const t_float p) const
Definition: fastcluster_dm.cxx:269
method_codes
method_codes
Definition: fastcluster_dm.cxx:148
nan_error
Definition: fastcluster_dm.cxx:386
operator<
bool operator<(const node a, const node b)
Definition: fastcluster_dm.cxx:218
doubly_linked_list::~doubly_linked_list
~doubly_linked_list()
Definition: fastcluster_dm.cxx:323
METHOD_METR_SINGLE
@ METHOD_METR_SINGLE
Definition: fastcluster_dm.cxx:150
union_find::Union
void Union(const t_index node1, const t_index node2)
Definition: fastcluster_dm.cxx:382
binary_min_heap::remove
void remove(t_index idx)
Definition: fastcluster_dm.cxx:855
auto_array_ptr::init
void init(index const size, value const val)
Definition: fastcluster_dm.cxx:205
node
Definition: fastcluster_dm.cxx:213
fc_isnan
bool fc_isnan(double x)
Definition: fastcluster.cxx:17
doubly_linked_list::start
t_index start
Definition: fastcluster_dm.cxx:303
t_index
int_fast32_t t_index
Definition: fastcluster_dm.cxx:129
METHOD_METR_WEIGHTED
@ METHOD_METR_WEIGHTED
Definition: fastcluster_dm.cxx:153
cluster_result
Definition: fastcluster_dm.cxx:223
doubly_linked_list
Definition: fastcluster_dm.cxx:292
MAX_METHOD_CODE
@ MAX_METHOD_CODE
Definition: fastcluster_dm.cxx:161
binary_min_heap::replace
void replace(const t_index idxold, const t_index idxnew, const t_float val)
Definition: fastcluster_dm.cxx:868
METHOD_VECTOR_CENTROID
@ METHOD_VECTOR_CENTROID
Definition: fastcluster_dm.cxx:168
doubly_linked_list::is_inactive
bool is_inactive(t_index idx) const
Definition: fastcluster_dm.cxx:337
binary_min_heap::argmin
t_index argmin() const
Definition: fastcluster_dm.cxx:840
union_find
Definition: fastcluster_dm.cxx:355
cluster_result::sqrt
void sqrt() const
Definition: fastcluster_dm.cxx:244
auto_array_ptr::auto_array_ptr
auto_array_ptr(index const size, value const val)
Definition: fastcluster_dm.cxx:189
binary_min_heap::heapify
void heapify()
Definition: fastcluster_dm.cxx:825
binary_min_heap::binary_min_heap
binary_min_heap(t_float *const A_, const t_index size1, const t_index size2, const t_index start)
Definition: fastcluster_dm.cxx:813
MAX_METHOD_VECTOR_CODE
@ MAX_METHOD_VECTOR_CODE
Definition: fastcluster_dm.cxx:172
METHOD_VECTOR_MEDIAN
@ METHOD_VECTOR_MEDIAN
Definition: fastcluster_dm.cxx:169
METHOD_METR_MEDIAN
@ METHOD_METR_MEDIAN
Definition: fastcluster_dm.cxx:157
doubly_linked_list::doubly_linked_list
doubly_linked_list(const t_index size)
Definition: fastcluster_dm.cxx:311
my_pow
#define my_pow
Definition: fastcluster_dm.cxx:266
auto_array_ptr::~auto_array_ptr
~auto_array_ptr()
Definition: fastcluster_dm.cxx:193
cluster_result::divide
void divide(const t_float denom) const
Definition: fastcluster_dm.cxx:284
binary_min_heap::heap_pop
void heap_pop()
Definition: fastcluster_dm.cxx:846
auto_array_ptr::auto_array_ptr
auto_array_ptr(index const size)
Definition: fastcluster_dm.cxx:185
auto_array_ptr::free
void free()
Definition: fastcluster_dm.cxx:194
t_float
double t_float
Definition: fastcluster_dm.cxx:141
METHOD_VECTOR_WARD
@ METHOD_VECTOR_WARD
Definition: fastcluster_dm.cxx:167
METHOD_METR_COMPLETE
@ METHOD_METR_COMPLETE
Definition: fastcluster_dm.cxx:151
method_codes_vector
method_codes_vector
Definition: fastcluster_dm.cxx:164
cluster_result::plusone
void plusone(const t_float) const
Definition: fastcluster_dm.cxx:277
union_find::union_find
union_find(const t_index size)
Definition: fastcluster_dm.cxx:361
cluster_result::sqrt
void sqrt(const t_float) const
Definition: fastcluster_dm.cxx:251
cluster_result::cluster_result
cluster_result(const t_index size)
Definition: fastcluster_dm.cxx:229
node::node1
t_index node1
Definition: fastcluster_dm.cxx:214
FILL_N
#define FILL_N
Definition: fastcluster_dm.cxx:94
binary_min_heap::update_geq
void update_geq(const t_index idx, const t_float val) const
Definition: fastcluster_dm.cxx:896
binary_min_heap::binary_min_heap
binary_min_heap(t_float *const A_, const t_index size_)
Definition: fastcluster_dm.cxx:806
cluster_result::sqrtdouble
void sqrtdouble(const t_float) const
Definition: fastcluster_dm.cxx:256
MIN_METHOD_CODE
@ MIN_METHOD_CODE
Definition: fastcluster_dm.cxx:160
METHOD_METR_CENTROID
@ METHOD_METR_CENTROID
Definition: fastcluster_dm.cxx:156
METHOD_METR_WARD_D
@ METHOD_METR_WARD_D
Definition: fastcluster_dm.cxx:155
node::node2
t_index node2
Definition: fastcluster_dm.cxx:214
size_
#define size_(r_)
Definition: fastcluster_R_dm.cxx:65
METHOD_METR_AVERAGE
@ METHOD_METR_AVERAGE
Definition: fastcluster_dm.cxx:152
doubly_linked_list::succ
auto_array_ptr< t_index > succ
Definition: fastcluster_dm.cxx:304
METHOD_METR_WARD
@ METHOD_METR_WARD
Definition: fastcluster_dm.cxx:154
binary_min_heap::update
void update(const t_index idx, const t_float val) const
Definition: fastcluster_dm.cxx:878
binary_min_heap::~binary_min_heap
~binary_min_heap()
Definition: fastcluster_dm.cxx:823
METHOD_METR_WARD_D2
@ METHOD_METR_WARD_D2
Definition: fastcluster_dm.cxx:158
D_
#define D_(r_, c_)
Definition: fastcluster_dm.cxx:343
binary_min_heap
Definition: fastcluster_dm.cxx:772
doubly_linked_list::remove
void remove(const t_index idx)
Definition: fastcluster_dm.cxx:325
c
constexpr auto c
Definition: AtRadialChargeModel.cxx:20
auto_array_ptr::auto_array_ptr
auto_array_ptr()
Definition: fastcluster_dm.cxx:183