ATTPCROOT  0.3.0-alpha
A ROOT-based framework for analyzing data from active target detectors
cluster.cxx
Go to the documentation of this file.
1 //
2 // cluster.cpp
3 // Functions for triplet clustering and for propagating
4 // the triplet cluster labels to points
5 //
6 // Author: Jens Wilberg, Lukas Aymans, Christoph Dalitz
7 // Date: 2019-04-02
8 // License: see ../LICENSE
9 //
10 
11 #include "cluster.h"
12 
13 #include "pointcloud.h" // for PointCloud, Point
14 #include "triplet.h" // for triplet, ScaleTripletMetric
15 
16 #include <algorithm>
17 #include <cmath>
18 #include <iostream> // for operator<<, basic_ostream, endl, ofs...
19 #include <iterator> // for distance
20 #include <memory> // for allocator_traits<>::value_type
21 #include <set> // for set, operator!=, operator==, set<>::...
22 
23 #include "hclust/fastcluster.h"
24 
25 // compute mean of *a* with size *m*
26 double mean(const double *a, size_t m)
27 {
28  double sum = 0;
29  for (size_t i = 0; i < m; ++i) {
30  sum += a[i];
31  }
32  return sum / m;
33 }
34 
35 // compute standard deviation of *a* with size *m*
36 double sd(const double *a, size_t m)
37 {
38  double sum = 0, result;
39  double mean_val = mean(a, m);
40  for (size_t k = 0; k < m; ++k) {
41  double tmp = mean_val - a[k];
42  sum += (tmp * tmp);
43  }
44  result = (1.0 / (m - 1.0)) * sum;
45  return std::sqrt(result);
46 }
47 
48 //-------------------------------------------------------------------
49 // computation of condensed distance matrix.
50 // The distance matrix is computed from the triplets in *triplets*
51 // and saved in *result*. *triplet_metric* is used as distance metric.
52 //-------------------------------------------------------------------
53 void calculate_distance_matrix(const std::vector<triplet> &triplets, const PointCloud &cloud, double *result,
54  ScaleTripletMetric &triplet_metric)
55 {
56  size_t const triplet_size = triplets.size();
57  size_t k = 0;
58 
59  for (size_t i = 0; i < triplet_size; ++i) {
60  for (size_t j = i + 1; j < triplet_size; j++) {
61  result[k++] = triplet_metric(triplets[i], triplets[j]);
62  }
63  }
64 }
65 
66 //-------------------------------------------------------------------
67 // Computation of the clustering.
68 // The triplets in *triplets* are clustered by the fastcluster algorithm
69 // and the result is returned as cluster_group. *t* is the cut distance
70 // and *triplet_metric* is the distance metric for the triplets.
71 // *opt_verbose* is the verbosity level for debug outputs. the clustering
72 // is returned in *result*.
73 //-------------------------------------------------------------------
74 void compute_hc(const PointCloud &cloud, cluster_group &result, const std::vector<triplet> &triplets, double s,
75  double t, bool tauto, double dmax, bool is_dmax, Linkage method, int opt_verbose)
76 {
77  const size_t triplet_size = triplets.size();
78  size_t k, cluster_size;
80 
81  if (triplet_size < 3) {
82  // if no triplets are generated
83  return;
84  }
85  // choose linkage method
86  switch (method) {
87  case SINGLE: link = HCLUST_METHOD_SINGLE; break;
88  case COMPLETE: link = HCLUST_METHOD_COMPLETE; break;
89  case AVERAGE: link = HCLUST_METHOD_AVERAGE; break;
90  }
91 
92  double *distance_matrix = new double[(triplet_size * (triplet_size - 1)) / 2];
93  double *cdists = new double[triplet_size - 1];
94  int *merge = new int[2 * (triplet_size - 1)], *labels = new int[triplet_size];
95  ScaleTripletMetric metric(s);
96  calculate_distance_matrix(triplets, cloud, distance_matrix, metric);
97 
98  hclust_fast(triplet_size, distance_matrix, link, merge, cdists);
99 
100  // splitting the dendrogram into clusters
101  if (tauto) {
102  // automatic stopping criterion where cdist is unexpected large
103  for (k = (triplet_size - 1) / 2; k < (triplet_size - 1); ++k) {
104  if ((cdists[k - 1] > 0.0 || cdists[k] > 1.0e-8) && (cdists[k] > cdists[k - 1] + 2 * sd(cdists, k + 1))) {
105  break;
106  }
107  }
108  if (opt_verbose) {
109  double automatic_t;
110  double prev_cdist = (k > 0) ? cdists[k - 1] : 0.0;
111  if (k < (triplet_size - 1)) {
112  automatic_t = (prev_cdist + cdists[k]) / 2.0;
113  } else {
114  automatic_t = cdists[k - 1];
115  }
116  std::cout << "[Info] optimal cdist threshold: " << automatic_t << std::endl;
117  }
118  } else {
119  // fixed threshold t
120  for (k = 0; k < (triplet_size - 1); ++k) {
121  if (cdists[k] >= t) {
122  break;
123  }
124  }
125  }
126  cluster_size = triplet_size - k;
127  cutree_k(triplet_size, merge, cluster_size, labels);
128 
129  // generate clusters
130  for (size_t i = 0; i < cluster_size; ++i) {
131  cluster_t new_cluster;
132  result.push_back(new_cluster);
133  }
134 
135  for (size_t i = 0; i < triplet_size; ++i) {
136  result[labels[i]].push_back(i);
137  }
138 
139  if (opt_verbose > 1) {
140  // write debug file
141  const char *fname = "debug_cdist.csv";
142  std::ofstream of(fname);
143  of << std::fixed; // set float style
144  if (of.is_open()) {
145  for (size_t i = 0; i < (triplet_size - 1); ++i) {
146  of << cdists[i] << std::endl;
147  }
148  } else {
149  std::cerr << "[Error] could not write file '" << fname << "'\n";
150  }
151  of.close();
152  }
153 
154  // cleanup
155  delete[] distance_matrix;
156  delete[] cdists;
157  delete[] merge;
158  delete[] labels;
159 }
160 
161 //-------------------------------------------------------------------
162 // Remove all clusters in *cl_group* which contains less then *m*
163 // triplets. *cl_group* will be modified.
164 //-------------------------------------------------------------------
165 void cleanup_cluster_group(cluster_group &cl_group, size_t m, int opt_verbose)
166 {
167  size_t old_size = cl_group.size();
168  cluster_group::iterator it = cl_group.begin();
169  while (it != cl_group.end()) {
170  if (it->size() < m) {
171  it = cl_group.erase(it);
172  } else {
173  ++it;
174  }
175  }
176  if (opt_verbose > 0) {
177  std::cout << "[Info] in pruning removed clusters: " << old_size - cl_group.size() << std::endl;
178  }
179 }
180 
181 //-------------------------------------------------------------------
182 // Convert the triplet indices ind *cl_group* to point indices.
183 // *triplets* contains all triplets and *cl_group* will be modified.
184 //-------------------------------------------------------------------
185 void cluster_triplets_to_points(const std::vector<triplet> &triplets, cluster_group &cl_group)
186 {
187  for (size_t i = 0; i < cl_group.size(); ++i) {
188  cluster_t point_indices, &current_cluster = cl_group[i];
189  for (std::vector<size_t>::const_iterator triplet_index = current_cluster.begin();
190  triplet_index < current_cluster.end(); ++triplet_index) {
191  const triplet &current_triplet = triplets[*triplet_index];
192  point_indices.push_back(current_triplet.point_index_a);
193  point_indices.push_back(current_triplet.point_index_b);
194  point_indices.push_back(current_triplet.point_index_c);
195  }
196  // sort point-indices and remove duplicates
197  std::sort(point_indices.begin(), point_indices.end());
198  std::vector<size_t>::iterator new_end = std::unique(point_indices.begin(), point_indices.end());
199  point_indices.resize(std::distance(point_indices.begin(), new_end));
200  // replace triplet cluster with point cluster
201  current_cluster = point_indices;
202  }
203 }
204 
205 //-------------------------------------------------------------------
206 // Adds the cluster ids to the points in *cloud*
207 // *cl_group* contains the clusters with the point indices. For every
208 // point the corresponding cluster id is saved. For gnuplot the points
209 // which overlap between multiple clusteres are saved in seperate
210 // clusters in *cl_group*
211 //-------------------------------------------------------------------
212 void add_clusters(PointCloud &cloud, cluster_group &cl_group, bool gnuplot)
213 {
214  for (size_t i = 0; i < cl_group.size(); ++i) {
215  const std::vector<size_t> &current_cluster = cl_group[i];
216  for (std::vector<size_t>::const_iterator point_index = current_cluster.begin();
217  point_index != current_cluster.end(); ++point_index) {
218  cloud[*point_index].cluster_ids.insert(i);
219  }
220  }
221 
222  // add point indices to the corresponding cluster/vertex vector. this is for
223  // the gnuplot output
224  if (gnuplot) {
225  std::vector<cluster_t> verticies;
226  for (size_t i = 0; i < cloud.size(); ++i) {
227  const Point &p = cloud[i];
228  if (p.cluster_ids.size() > 1) {
229  // if the point is in multiple clusters add it to the corresponding
230  // vertex or create one if none exists
231  bool found = false;
232  for (std::vector<cluster_t>::iterator v = verticies.begin(); v != verticies.end(); ++v) {
233  if (cloud[v->at(0)].cluster_ids == p.cluster_ids) {
234  v->push_back(i);
235  found = true;
236  }
237  }
238  if (!found) {
239  cluster_t v;
240  v.push_back(i);
241  verticies.push_back(v);
242  }
243  // remove the point from all other clusters
244  for (std::set<size_t>::iterator it = p.cluster_ids.begin(); it != p.cluster_ids.end(); ++it) {
245  cluster_t &cluster = cl_group[*it];
246  cluster.erase(std::remove(cluster.begin(), cluster.end(), i), cluster.end());
247  }
248  }
249  }
250  // extend clusters with verticies
251  cl_group.reserve(cl_group.size() + verticies.size());
252  cl_group.insert(cl_group.end(), verticies.begin(), verticies.end());
253  }
254 }
calculate_distance_matrix
void calculate_distance_matrix(const std::vector< triplet > &triplets, const PointCloud &cloud, double *result, ScaleTripletMetric &triplet_metric)
Definition: cluster.cxx:53
Linkage
Linkage
Definition: util.h:13
compute_hc
void compute_hc(const PointCloud &cloud, cluster_group &result, const std::vector< triplet > &triplets, double s, double t, bool tauto, double dmax, bool is_dmax, Linkage method, int opt_verbose)
Definition: cluster.cxx:74
triplet::point_index_c
size_t point_index_c
Definition: triplet.h:23
HCLUST_METHOD_AVERAGE
@ HCLUST_METHOD_AVERAGE
Definition: fastcluster.h:68
AVERAGE
@ AVERAGE
Definition: util.h:13
fastcluster.h
hc::cluster
std::vector< size_t > cluster
Definition: hc.h:25
triplet
Definition: triplet.h:20
cutree_k
void cutree_k(int n, const int *merge, int nclust, int *labels)
Definition: fastcluster.cxx:38
triplet::point_index_b
size_t point_index_b
Definition: triplet.h:22
Point
Definition: main.cpp:71
hclust_fast
int hclust_fast(int n, double *distmat, int method, int *merge, double *height)
Definition: fastcluster.cxx:141
HCLUST_METHOD_COMPLETE
@ HCLUST_METHOD_COMPLETE
Definition: fastcluster.h:67
pointcloud.h
Point::cluster_ids
std::set< size_t > cluster_ids
Definition: pointcloud.h:24
triplet.h
triplet::point_index_a
size_t point_index_a
Definition: triplet.h:21
SINGLE
@ SINGLE
Definition: util.h:13
PointCloud
Definition: pointcloud.h:60
ScaleTripletMetric
Definition: triplet.h:32
COMPLETE
@ COMPLETE
Definition: util.h:13
sd
double sd(const double *a, size_t m)
Definition: cluster.cxx:36
hclust_fast_methods
hclust_fast_methods
Definition: fastcluster.h:65
cleanup_cluster_group
void cleanup_cluster_group(cluster_group &cl_group, size_t m, int opt_verbose)
Definition: cluster.cxx:165
cluster_triplets_to_points
void cluster_triplets_to_points(const std::vector< triplet > &triplets, cluster_group &cl_group)
Definition: cluster.cxx:185
add_clusters
void add_clusters(PointCloud &cloud, cluster_group &cl_group, bool gnuplot)
Definition: cluster.cxx:212
HCLUST_METHOD_SINGLE
@ HCLUST_METHOD_SINGLE
Definition: fastcluster.h:66
cluster_t
std::vector< size_t > cluster_t
Definition: cluster.h:21
cluster_group
std::vector< cluster_t > cluster_group
Definition: cluster.h:25
cluster.h
mean
double mean(const double *a, size_t m)
Definition: cluster.cxx:26