188 lines
5.7 KiB
Java
188 lines
5.7 KiB
Java
|
|
package tutorial.clustering;
|
|
import de.lmu.ifi.dbs.elki.algorithm.AbstractDistanceBasedAlgorithm;
|
|
import de.lmu.ifi.dbs.elki.data.type.TypeInformation;
|
|
import de.lmu.ifi.dbs.elki.distance.distancefunction.DistanceFunction;
|
|
import de.lmu.ifi.dbs.elki.distance.distancevalue.NumberDistance;
|
|
import de.lmu.ifi.dbs.elki.logging.Logging;
|
|
import de.lmu.ifi.dbs.elki.result.Result;
|
|
public class NaiveAgglomerativeHierarchicalClustering<O, D extends NumberDistance<D, ?>>
|
|
extends AbstractDistanceBasedAlgorithm<O, D, Result>
|
|
{
|
|
protected NaiveAgglomerativeHierarchicalClustering(DistanceFunction<? super O, D> distanceFunction)
|
|
{
|
|
super(distanceFunction);
|
|
// TODO Auto-generated constructor stub
|
|
}
|
|
public TypeInformation[] getInputTypeRestriction()
|
|
{
|
|
// TODO Auto-generated method stub
|
|
return null;
|
|
}
|
|
protected Logging getLogger()
|
|
{
|
|
// TODO Auto-generated method stub
|
|
return null;
|
|
}
|
|
}
|
|
/**
|
|
* Static class logger.
|
|
*/
|
|
private static final Logging LOG = Logging.getLogger(NaiveAgglomerativeHierarchicalClustering.class);
|
|
protected Logging getLogger()
|
|
{
|
|
return LOG;
|
|
}
|
|
public TypeInformation[] getInputTypeRestriction()
|
|
{
|
|
return TypeUtil.array(
|
|
getDistanceFunction().getInputTypeRestriction()
|
|
);
|
|
}
|
|
// The run method
|
|
public Result run(Database db, Relation<O> relation)
|
|
{
|
|
return null;
|
|
}
|
|
// Distance query on the relation, using the configured distance function.
DistanceQuery<O, D> dq = db.getDistanceQuery(relation, getDistanceFunction());

// Array form of the relation's object IDs, and the number of objects.
ArrayDBIDs ids = DBIDUtil.ensureArray(relation.getDBIDs());
final int size = ids.size();

// Point verbose users at the faster alternative.
LOG.verbose("Notice: SLINK is a much faster algorithm for single-linkage clustering!");
|
|
Computing the distance matrix
|
|
// Full size x size distance matrix. Only pairs with col < row are computed;
// each value is mirrored, and the diagonal keeps Java's default of 0.0.
double[][] matrix = new double[size][size];
DBIDArrayIter ix = ids.iter();
DBIDArrayIter iy = ids.iter();
for (int row = 0; ix.valid(); row++, ix.advance()) {
  // Restart the inner iterator at the first object for every row.
  iy.seek(0);
  for (int col = 0; col < row; col++, iy.advance()) {
    final double d = dq.distance(ix, iy).doubleValue();
    matrix[row][col] = matrix[col][row] = d;
  }
}
|
|
Algorithm main loop
|
|
// One merge per iteration: "size" objects minus the requested number of
// clusters gives the number of iterations.
final int stop = size - numclusters;

// Progress reporting is only set up when verbose logging is enabled.
FiniteProgress prog = null;
if (LOG.isVerbose()) {
  prog = new FiniteProgress("Agglomerative clustering", stop, LOG);
}

for (int step = 0; step < stop; step++) {
  // TODO: find clusters to merge
  // TODO: store the merge in auxiliary data
  // TODO: update distance matrix
  if (prog != null) {
    prog.incrementProcessed(LOG);
  }
}

if (prog != null) {
  prog.ensureCompleted(LOG);
}
|
|
// Search the lower triangle of the matrix for the smallest distance between
// two entries that are still active. An entry whose height is below
// +infinity is skipped (the merge step sets height when an entry is merged
// away; "height" is declared outside this excerpt - presumably initialized
// to +infinity, confirm against the full method).
// If no pair qualifies, minx and miny stay at -1.
double min = Double.POSITIVE_INFINITY;
int minx = -1;
int miny = -1;
for (int a = 0; a < size; a++) {
  if (!(height[a] < Double.POSITIVE_INFINITY)) {
    final double[] row = matrix[a];
    for (int b = 0; b < a; b++) {
      if (!(height[b] < Double.POSITIVE_INFINITY) && row[b] < min) {
        min = row[b];
        minx = a;
        miny = b;
      }
    }
  }
}
|
|
// Avoid allocating memory, by reusing existing iterators:
|
|
ix.seek(minx);
|
|
iy.seek(miny);
|
|
// Perform merge in data structure: x -> y
|
|
// Since y < x, prefer keeping y, dropping x.
|
|
height[minx] = min;
|
|
parent.set(minx, iy);
|
|
// Merge into cluster
|
|
ModifiableDBIDs cx = clusters.get(minx);
|
|
ModifiableDBIDs cy = clusters.get(miny);
|
|
if (cy == null)
|
|
{
|
|
cy = DBIDUtil.newHashSet();
|
|
cy.add(iy);
|
|
}
|
|
if (cx == null)
|
|
{
|
|
cy.add(ix);
|
|
}
|
|
else
|
|
{
|
|
cy.addDBIDs(cx);
|
|
clusters.remove(minx);
|
|
}
|
|
clusters.put(miny, cy);
|
|
// Update distance matrix for y:
|
|
for (int j = 0; j < size; j++)
|
|
{
|
|
matrix[j][miny] = Math.min(matrix[j][minx], matrix[j][miny]);
|
|
matrix[miny][j] = Math.min(matrix[minx][j], matrix[miny][j]);
|
|
}
|
|
Returning a Clustering
|
|
// Collect every entry that was never merged away (height not below
// +infinity) as one top-level cluster of the result.
final Clustering<Model> dendrogram = new Clustering<>("Hierarchical-Clustering", "hierarchical-clustering");
for (int idx = 0; idx < size; idx++) {
  if (!(height[idx] < Double.POSITIVE_INFINITY)) {
    DBIDs members = clusters.get(idx);
    if (members == null) {
      // Singleton without a stored member set: use the object itself.
      ix.seek(idx);
      members = DBIDUtil.deref(ix);
    }
    Cluster<Model> toplevel = new Cluster<>("Cluster", members);
    dendrogram.addToplevelCluster(toplevel);
  }
}
return dendrogram;
|
|
Updating the constructor
|
|
/**
|
|
* Threshold, how many clusters to extract.
|
|
*/
|
|
int numclusters;
|
|
public NaiveAgglomerativeHierarchicalClustering(
|
|
DistanceFunction<? super O, D> distanceFunction,
|
|
int numclusters)
|
|
{
|
|
super(distanceFunction);
|
|
this.numclusters = numclusters;
|
|
}
|
|
Adding a Parameterizer
|
|
public static class Parameterizer<O, D extends NumberDistance<D, ?>>
|
|
extends AbstractDistanceBasedAlgorithm.Parameterizer<O, D>
|
|
{
|
|
int numclusters = 0;
|
|
protected void makeOptions(Parameterization config)
|
|
{
|
|
super.makeOptions(config);
|
|
IntParameter numclustersP = new IntParameter(SLINK.Parameterizer.SLINK_MINCLUSTERS_ID);
|
|
numclustersP.addConstraint(new GreaterEqualConstraint(1));
|
|
if (config.grab(numclustersP))
|
|
{
|
|
numclusters = numclustersP.intValue();
|
|
}
|
|
}
|
|
protected NaiveAgglomerativeHierarchicalClustering<O, D> makeInstance()
|
|
{
|
|
return new NaiveAgglomerativeHierarchicalClustering<>(distanceFunction, numclusters);
|
|
}
|
|
}
|