programming-examples/java/Implementation of Agglomerative Algorithm.java
2019-11-18 13:43:20 +01:00

188 lines
5.7 KiB
Java

package tutorial.clustering;
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
import de.lmu.ifi.dbs.elki.logging.Logging;
import de.lmu.ifi.dbs.elki.result.Result;
/**
 * Skeleton of a naive agglomerative hierarchical clustering algorithm.
 *
 * <p>All members are still placeholders: the constructor only forwards the
 * distance function to the superclass, and both accessors return {@code null}.
 *
 * @param <O> object type accepted by the distance function
 * @param <D> distance value type
 */
public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistance<D, ?>>
    extends AbstractDistanceBasedAlgorithm<O, D, Result> {
  /**
   * Creates the algorithm for the given distance function.
   *
   * @param distanceFunction distance function handed to the superclass
   */
  protected NaiveAgglomerativeHierarchicalClustering(
      DistanceFunction<? super O, D> distanceFunction) {
    super(distanceFunction);
    // TODO: generated stub; nothing beyond the superclass call is set up yet.
  }

  /**
   * Placeholder; no input type restriction is reported yet.
   *
   * @return always {@code null}
   */
  public TypeInformation[] getInputTypeRestriction() {
    // TODO: generated stub; not implemented yet.
    return null;
  }

  /**
   * Placeholder; no logger is provided yet.
   *
   * @return always {@code null}
   */
  protected Logging getLogger() {
    // TODO: generated stub; not implemented yet.
    return null;
  }
}
/**
* Class-wide logger, obtained once from {@code Logging.getLogger} for
* {@link NaiveAgglomerativeHierarchicalClustering} and shared by all instances.
*/
private static final Logging LOG = Logging.getLogger(NaiveAgglomerativeHierarchicalClustering.class);
/**
 * Supplies the logger used by this class.
 *
 * @return the static {@code LOG} instance
 */
protected Logging getLogger() {
  return LOG;
}
/**
 * Reports the input type this algorithm accepts, taken from the configured
 * distance function.
 *
 * @return the distance function's input type restriction, packed into an
 *         array by {@code TypeUtil.array}
 */
public TypeInformation[] getInputTypeRestriction() {
  return TypeUtil.array(getDistanceFunction().getInputTypeRestriction());
}
/**
 * Entry point of the algorithm. This version is an empty skeleton: it reads
 * neither argument and produces no result.
 *
 * @param db database the algorithm is run on (unused here)
 * @param relation relation holding the objects to cluster (unused here)
 * @return always {@code null}
 */
public Result run(Database db, Relation<O> relation) {
  return null;
}
// Setup: obtain a distance query for the relation, based on the configured distance function.
DistanceQuery<O, D> dq = db.getDistanceQuery(relation, getDistanceFunction());
// Get the relation's object ids in array form; later code addresses them by integer offset
// through array iterators (seek).
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
// Number of objects; also used as the side length of the distance matrix.
final int size = ids.size();
// Emit a verbose-level hint pointing users at SLINK.
LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
Computing the distance matrix
// Pairwise distances, kept as a full size x size matrix filled symmetrically.
// Diagonal cells are never written and keep Java's default value 0.0.
double[][] matrix = new double[size][size];
DBIDArrayIter ix = ids.iter();
DBIDArrayIter iy = ids.iter();
for (int row = 0; ix.valid(); row++, ix.advance()) {
  // Rewind the inner iterator so it walks the objects located before "row".
  iy.seek(0);
  for (int col = 0; col < row; col++, iy.advance()) {
    final double pairDistance = dq.distance(ix, iy).doubleValue();
    // Store the value in both orientations.
    matrix[row][col] = pairDistance;
    matrix[col][row] = pairDistance;
  }
}
Algorithm main loop
// Number of loop iterations to run: size - numclusters.
final int stop = size - numclusters;
// A progress tracker is only created when verbose logging is enabled.
FiniteProgress prog = null;
if (LOG.isVerbose()) {
  prog = new FiniteProgress("Agglomerative clustering", stop, LOG);
}
for (int step = 0; step < stop; step++) {
  // TODO: find clusters to merge
  // TODO: store the merge in auxiliary data
  // TODO: update distance matrix
  if (prog != null) {
    prog.incrementProcessed(LOG);
  }
}
if (prog != null) {
  prog.ensureCompleted(LOG);
}
// Scan the lower triangle (col < row) for the smallest distance between two
// positions that are still active. A position whose height is below positive
// infinity is skipped; the merge code that follows writes the merge distance
// into height[minx].
double min = Double.POSITIVE_INFINITY;
int minx = -1;
int miny = -1;
for (int row = 0; row < size; row++) {
  if (height[row] < Double.POSITIVE_INFINITY) {
    continue;
  }
  final double[] rowDistances = matrix[row];
  for (int col = 0; col < row; col++) {
    final boolean colActive = !(height[col] < Double.POSITIVE_INFINITY);
    if (colActive && rowDistances[col] < min) {
      // Strictly smaller only: on ties the first pair encountered is kept.
      min = rowDistances[col];
      minx = row;
      miny = col;
    }
  }
}
// Merge step: fold position minx into position miny, then refresh miny's distances.
// Avoid allocating memory, by reusing existing iterators:
ix.seek(minx);
iy.seek(miny);
// Perform merge in data structure: x -> y
// Since y < x, prefer keeping y, dropping x.
// Writing a finite height for minx is what makes the search loop and the
// result loop skip this position from now on.
height[minx] = min;
// Record the object at miny as the parent of position minx.
parent.set(minx, iy);
// Merge into cluster
// A null lookup means no member set has been stored for that position yet;
// the result code treats such a position as a single object.
ModifiableDBIDs cx = clusters.get(minx);
ModifiableDBIDs cy = clusters.get(miny);
if (cy == null)
{
// First merge into miny: start its member set with the object at miny itself.
cy = DBIDUtil.newHashSet();
cy.add(iy);
}
if (cx == null)
{
// minx had no stored set: add just the object at minx.
cy.add(ix);
}
else
{
// minx already had members: move them all over and drop its map entry.
cy.addDBIDs(cx);
clusters.remove(minx);
}
clusters.put(miny, cy);
// Update distance matrix for y:
// Row and column miny take the element-wise minimum of their old values and
// those of minx.
// NOTE(review): this looks like the single-linkage update rule, in line with
// the SLINK notice logged earlier - confirm.
for (int j = 0; j < size; j++)
{
matrix[j][miny] = Math.min(matrix[j][minx], matrix[j][miny]);
matrix[miny][j] = Math.min(matrix[minx][j], matrix[miny][j]);
}
Returning a Clustering
// Build the result: each position whose height is not below positive infinity
// is reported as one top-level cluster.
final Clustering<Model> dendrogram =
    new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
for (int pos = 0; pos < size; pos++) {
  if (!(height[pos] < Double.POSITIVE_INFINITY)) {
    DBIDs members = clusters.get(pos);
    if (members == null) {
      // No stored member set for this position: use the single object located there.
      ix.seek(pos);
      members = DBIDUtil.deref(ix);
    }
    Cluster<Model> cluster = new Cluster<>("Cluster", members);
    dendrogram.addToplevelCluster(cluster);
  }
}
return dendrogram;
Updating the constructor
/**
 * Number of clusters to extract.
 */
int numclusters;

/**
 * Creates the algorithm with a distance function and a target cluster count.
 *
 * @param distanceFunction distance function passed on to the superclass
 * @param numclusters number of clusters to extract
 */
public NaiveAgglomerativeHierarchicalClustering(
    DistanceFunction<? super O, D> distanceFunction, int numclusters) {
  super(distanceFunction);
  this.numclusters = numclusters;
}
Adding a Parameterizer
/**
 * Parameterization helper: reads the cluster-count option and creates the
 * algorithm instance.
 *
 * @param <O> object type
 * @param <D> distance value type
 */
public static class Parameterizer<O, D extends NumberDistance<D, ?>>
    extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D> {
  /** Requested number of clusters; remains 0 unless the option is grabbed successfully. */
  int numclusters = 0;

  /**
   * Processes the superclass options, then the cluster-count option.
   *
   * @param config parameterization to read the options from
   */
  protected void makeOptions(Parameterization config) {
    super.makeOptions(config);
    // The option id is borrowed from SLINK's parameterizer.
    IntParameter countParam = new IntParameter(SLINK.Parameterizer.SLINK_MINCLUSTERS_ID);
    // Going by the constraint's name, the value is required to be >= 1.
    countParam.addConstraint(new GreaterEqualConstraint(1));
    if (config.grab(countParam)) {
      numclusters = countParam.intValue();
    }
  }

  /**
   * Creates the algorithm from the collected settings.
   *
   * @return a new instance using the inherited distance function and {@code numclusters}
   */
  protected NaiveAgglomerativeHierarchicalClustering<O, D> makeInstance() {
    return new NaiveAgglomerativeHierarchicalClustering<>(distanceFunction, numclusters);
  }
}