-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkmeans_simple_impl.c
92 lines (86 loc) · 3.75 KB
/
kmeans_simple_impl.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
#include <float.h>
#include <math.h>
#include "kmeans.h"
/**
* Assigns each point in the dataset to a cluster based on the distance from that cluster.
*
* The return value indicates how many points were assigned to a _different_ cluster
* in this assignment process: this indicates how close the algorithm is to completion.
* When the return value is zero, no points changed cluster so the clustering is complete.
*
* @param dataset set of all points with current cluster assignments
* @param num_points number of points in the dataset
* @param centroids array that holds the current centroids
* @param num_clusters number of clusters - hence size of the centroids array
* @return the number of points for which the cluster assignment was changed
*/
int assign_clusters(struct point* dataset, int num_points, struct point *centroids, int num_clusters)
{
#ifdef DEBUG
printf("\nStarting assignment phase:\n");
#endif
int cluster_changes = 0;
for (int n = 0; n < num_points; ++n) {
double min_distance = DBL_MAX; // init the min distance to a big number
int closest_cluster = -1;
for (int k = 0; k < num_clusters; ++k) {
// calc the distance passing pointers to points since the distance does not modify them
double distance_from_centroid = euclidean_distance(&dataset[n], ¢roids[k]);
if (distance_from_centroid < min_distance) {
min_distance = distance_from_centroid;
closest_cluster = k;
}
}
// if the point was not already in the closest cluster, move it there and count changes
if (dataset[n].cluster != closest_cluster) {
dataset[n].cluster = closest_cluster;
cluster_changes++;
#ifdef TRACE
debug_assignment(&dataset[n], closest_cluster, ¢roids[closest_cluster], min_distance);
#endif
}
}
return cluster_changes;
}
/**
* Calculates new centroids for the clusters of the given dataset by finding the
* mean x and y coordinates of the current members of the cluster for each cluster.
*
* The centroids are set in the array passed in, which is expected to be pre-allocated
* and contain the previous centroids: these are overwritten by the new values.
*
* @param dataset set of all points with current cluster assigments
* @param num_points number of points in the dataset
* @param centroids array to hold the centroids - already allocated
* @param num_clusters number of clusters - hence size of the centroids array
*/
void calculate_centroids(struct point* dataset, int num_points, struct point *centroids, int num_clusters)
{
double sum_of_x_per_cluster[num_clusters];
double sum_of_y_per_cluster[num_clusters];
int num_points_in_cluster[num_clusters];
for (int k = 0; k < num_clusters; ++k) {
sum_of_x_per_cluster[k] = 0.0;
sum_of_y_per_cluster[k] = 0.0;
num_points_in_cluster[k] = 0;
}
// loop over all points in the database and sum up
// the x coords of clusters to which each belongs
for (int n = 0; n < num_points; ++n) {
// use pointer to struct to avoid creating unnecessary copy in memory
struct point *p = &dataset[n];
int k = p->cluster;
sum_of_x_per_cluster[k] += p->x;
sum_of_y_per_cluster[k] += p->y;
// count the points in the cluster to get a mean later
num_points_in_cluster[k]++;
}
// the new centroids are at the mean x and y coords of the clusters
for (int k = 0; k < num_clusters; ++k) {
struct point new_centroid;
// mean x, mean y => new centroid
new_centroid.x = sum_of_x_per_cluster[k] / num_points_in_cluster[k];
new_centroid.y = sum_of_y_per_cluster[k] / num_points_in_cluster[k];
centroids[k] = new_centroid;
}
}