118 lines
2.8 KiB
Ruby
118 lines
2.8 KiB
Ruby
$:.unshift File.dirname($0)
|
|
require "set"
|
|
require "dbscan"
|
|
|
|
# Bottom-up subspace clustering on top of DBscan.
#
# Clusters are first computed for every single attribute. Higher-dimensional
# candidate subspaces are then generated level by level from the subspaces
# that still contain clusters, and each candidate is clustered again, but only
# inside the clusters of one of its lower-dimensional parent subspaces.
#
# Data model used throughout the class:
# * a subspace is a Set of attribute indexes (indexes into one row of db);
# * a "level" is a Hash mapping subspace => Array of clusters.
#
# NOTE(review): DBscan is defined outside this file. This class only relies on
# DBscan.new(measure) and DBscan#run(db, eps, min_pts) returning an Array of
# clusters, where each cluster responds to #size and can itself be passed to
# #run as a db -- confirm against the DBscan implementation.
class SUBCLU
  # measure - a class (or any object responding to .new(subspace)); it is
  #           instantiated once per DBscan run with the subspace (a Set of
  #           attribute indexes) and the instance is handed to DBscan.new.
  def initialize(measure)
    @measure = measure
  end

  # Clusters db separately in every one-dimensional subspace.
  #
  # db                  - collection of rows; the first row's indexes define
  #                       the available attributes.
  # eps, min_pts        - passed through unchanged to DBscan#run.
  # dimension_blacklist - collection (responds to #include?) of attribute
  #                       indexes that must be skipped.
  #
  # Returns a Hash { Set[attribute] => clusters } holding only the attributes
  # for which DBscan returned at least one cluster. An empty db yields {}.
  def subclu1(db, eps, min_pts, dimension_blacklist)
    c_and_s = {}
    first_row = db.first
    # Without a first row there are no attributes to enumerate.
    return c_and_s if first_row.nil?

    first_row.each_index do |attribute|
      next if dimension_blacklist.include?(attribute)

      subspace = Set.new([attribute])
      clusters = DBscan.new(@measure.new(subspace)).run(db, eps, min_pts)
      c_and_s[subspace] = clusters unless clusters.empty?
    end
    c_and_s
  end

  # Builds the candidate subspaces of the next level.
  #
  # Two subspaces of the current level are joined when they differ in exactly
  # one attribute. A candidate is dropped when it contains a blacklisted
  # attribute or when filter_subspace? rejects it.
  #
  # c_and_s             - current level (Hash subspace => clusters).
  # dimension_blacklist - collection (responds to #include?) of attribute
  #                       indexes that must not appear in a candidate.
  #
  # Returns a Hash { candidate subspace => [] }; the clusters of the
  # candidates are not known yet.
  def generate_candidate_subspaces(c_and_s, dimension_blacklist)
    c_and_s_next = {}
    c_and_s.each_key do |s_1|
      c_and_s.each_key do |s_2|
        next unless (s_1 - s_2).size == 1

        subspace = s_1 + s_2
        next if subspace.any? { |dim| dimension_blacklist.include?(dim) }
        next if filter_subspace?(subspace, c_and_s)

        c_and_s_next[subspace] = [] # clusters in higher subspace aren't yet known
      end
    end
    c_and_s_next
  end

  # Tells whether a candidate subspace has to be discarded.
  #
  # Returns true when at least one subspace obtained by removing a single
  # attribute from +subspace+ is missing from c_and_s or has no clusters
  # there; false otherwise.
  def filter_subspace?(subspace, c_and_s)
    subspace.any? do |dim|
      clusters = c_and_s[subspace - [dim]]
      clusters.nil? || clusters.empty?
    end
  end

  # Main entry point.
  #
  # db                  - collection of rows to cluster.
  # eps, min_pts        - passed through unchanged to DBscan#run.
  # max_dimensions      - maximum number of levels to compute (default 5);
  #                       the one-dimensional level is always computed.
  # dimension_blacklist - attribute indexes to ignore (default none).
  #
  # Returns an Array of levels: element 0 is the result of subclu1, every
  # following element holds the non-empty result of the next higher level.
  # The search stops at the first level without clusters.
  def run(db, eps, min_pts, max_dimensions = 5, dimension_blacklist = [])
    dimension_blacklist = Set.new(dimension_blacklist)

    # cluster all subspaces in one dimension
    c_and_s = subclu1(db, eps, min_pts, dimension_blacklist)
    results = [c_and_s]

    while !c_and_s.empty? && results.size < max_dimensions
      c_and_s_next = {}
      generate_candidate_subspaces(c_and_s, dimension_blacklist).each_key do |subspace|
        clusters = cluster_candidate(subspace, c_and_s, eps, min_pts)
        # Keep or drop the candidate only once ALL parent clusters have been
        # examined; deciding after each single cluster would discard a
        # subspace whose first parent cluster happens to contain nothing.
        c_and_s_next[subspace] = clusters unless clusters.empty?
      end

      results.push(c_and_s_next) unless c_and_s_next.empty?
      c_and_s = c_and_s_next
    end
    results
  end

  private

  # Runs DBscan for +subspace+ inside every cluster of its smallest parent
  # subspace and returns all clusters found, concatenated in parent order.
  def cluster_candidate(subspace, c_and_s, eps, min_pts)
    best_subspace = smallest_parent_subspace(subspace, c_and_s)
    clusters = []
    c_and_s[best_subspace].each do |cl|
      dbscan = DBscan.new(@measure.new(subspace))
      clusters.concat(dbscan.run(cl, eps, min_pts))
    end
    clusters
  end

  # Picks, among the subspaces obtained by removing one attribute from
  # +subspace+, the one whose clusters contain the fewest elements in total.
  # On a tie the first one in iteration order wins.
  def smallest_parent_subspace(subspace, c_and_s)
    parents = subspace.map { |dim| subspace - [dim] }
    parents.min_by do |s_k|
      c_and_s[s_k].inject(0) { |total, cluster| total + cluster.size }
    end
  end
end
|
|
|