You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

118 lines
2.8 KiB
Ruby

$:.unshift File.dirname($0)
require "set"
require "dbscan"
# Bottom-up subspace clustering driven by DBscan.
#
# Clusters are first searched in every single dimension; higher-dimensional
# candidate subspaces are then built by joining clustered subspaces that
# differ in exactly one dimension, and each candidate is clustered only
# inside the clusters already found in one of its lower-dimensional parents.
#
# Subspaces are represented as Sets of dimension indices. Every
# "clusters and subspaces" hash (c_and_s) maps such a Set to the list of
# clusters DBscan reported for it.
#
# NOTE(review): DBscan#run is assumed to return an Array of clusters, each
# cluster being a collection of points that responds to #size and can be
# passed back to DBscan#run as a database -- confirm against dbscan.rb.
class SUBCLU
  # measure - distance-measure class; it is instantiated as
  #           measure.new(subspace) with a Set of dimension indices and the
  #           instance is handed to DBscan.new.
  def initialize(measure)
    @measure = measure
  end

  # Clusters every single dimension that is not blacklisted.
  #
  # db                  - collection of points; the dimension indices are
  #                       taken from db.first.each_index.
  # eps, min_pts        - passed through to DBscan#run unchanged.
  # dimension_blacklist - collection of dimension indices to skip (anything
  #                       responding to #include?).
  #
  # Returns a Hash mapping one-element Sets to their non-empty cluster lists.
  # An empty db yields an empty Hash.
  def subclu1(db, eps, min_pts, dimension_blacklist)
    first_point = db.first
    return {} if first_point.nil? # nothing to derive the dimensions from

    c_and_s = {}
    first_point.each_index do |dimension|
      next if dimension_blacklist.include?(dimension)

      subspace = Set.new([dimension])
      clusters = DBscan.new(@measure.new(subspace)).run(db, eps, min_pts)
      c_and_s[subspace] = clusters unless clusters.empty?
    end
    c_and_s
  end

  # Builds the candidate subspaces one dimension larger than those in c_and_s.
  #
  # Two subspaces are joined when they differ in exactly one dimension. A
  # candidate is kept only if it contains no blacklisted dimension and passes
  # filter_subspace?.
  #
  # Returns a Hash mapping each candidate Set to an empty Array, because the
  # clusters of the higher-dimensional subspace are not known yet.
  def generate_candidate_subspaces(c_and_s, dimension_blacklist)
    c_and_s_next = {}
    subspaces = c_and_s.keys
    subspaces.each do |s_1|
      subspaces.each do |s_2|
        next unless (s_1 - s_2).size == 1

        subspace = s_1 + s_2
        # (s_1, s_2) and (s_2, s_1) produce the same union; check it once.
        next if c_and_s_next.key?(subspace)
        next if subspace.any? { |dim| dimension_blacklist.include?(dim) }
        next if filter_subspace?(subspace, c_and_s)

        c_and_s_next[subspace] = []
      end
    end
    c_and_s_next
  end

  # Decides whether a candidate subspace must be discarded.
  #
  # Returns true when at least one subspace obtained by dropping a single
  # dimension is missing from c_and_s or has no clusters, false otherwise.
  def filter_subspace?(subspace, c_and_s)
    subspace.any? do |dim|
      clusters = c_and_s[subspace - [dim]]
      clusters.nil? || clusters.empty?
    end
  end

  # Main entry point.
  #
  # db                  - collection of points (see subclu1).
  # eps, min_pts        - passed through to DBscan#run unchanged.
  # max_dimensions      - upper bound on the number of levels collected.
  # dimension_blacklist - dimension indices that must not appear in any
  #                       reported subspace.
  #
  # Returns an Array of c_and_s hashes: the first entry holds the
  # one-dimensional result (possibly empty), each following entry holds the
  # non-empty result for subspaces one dimension larger than the previous.
  def run(db, eps, min_pts, max_dimensions = 5, dimension_blacklist = [])
    dimension_blacklist = Set.new(dimension_blacklist)
    c_and_s = subclu1(db, eps, min_pts, dimension_blacklist)
    results = [c_and_s]

    while !c_and_s.empty? && results.size < max_dimensions
      c_and_s_next = {}
      generate_candidate_subspaces(c_and_s, dimension_blacklist).each_key do |subspace|
        clusters = cluster_candidate(subspace, c_and_s, eps, min_pts)
        # Decide once, after ALL parent clusters were searched: a candidate is
        # dropped only if none of them produced a cluster.
        c_and_s_next[subspace] = clusters unless clusters.empty?
      end
      results.push(c_and_s_next) unless c_and_s_next.empty?
      c_and_s = c_and_s_next
    end
    results
  end

  private

  # Runs DBscan for `subspace` inside every cluster of its smallest parent
  # subspace and returns the concatenation of all clusters found.
  def cluster_candidate(subspace, c_and_s, eps, min_pts)
    best_subspace = smallest_parent_subspace(subspace, c_and_s)
    c_and_s[best_subspace].flat_map do |cluster|
      # A fresh DBscan per cluster, in case an instance keeps state between runs.
      DBscan.new(@measure.new(subspace)).run(cluster, eps, min_pts)
    end
  end

  # Returns the parent of `subspace` (one dimension removed) whose clusters
  # contain the fewest points in total; the first such parent wins on ties.
  def smallest_parent_subspace(subspace, c_and_s)
    parents = subspace.map { |dim| subspace - [dim] }
    parents.min_by do |parent|
      c_and_s[parent].reduce(0) { |total, cluster| total + cluster.size }
    end
  end
end