Coverage details for edu.uci.ics.jung.algorithms.cluster.VoltageClusterer

Line | Hits | Source
1 /*
2  * Copyright (c) 2004, the JUNG Project and the Regents of the University
3  * of California
4  * All rights reserved.
5  *
6  * This software is open-source under the BSD license; see either
7  * "license.txt" or
8  * http://jung.sourceforge.net/license.txt for a description.
9  *
10  * Created on Aug 12, 2004
11  */
12 package edu.uci.ics.jung.algorithms.cluster;
13  
14 import java.util.Arrays;
15 import java.util.Collection;
16 import java.util.Comparator;
17 import java.util.HashMap;
18 import java.util.HashSet;
19 import java.util.Iterator;
20 import java.util.LinkedList;
21 import java.util.Map;
22 import java.util.Set;
23  
24 import cern.jet.random.engine.DRand;
25 import cern.jet.random.engine.RandomEngine;
26 import edu.uci.ics.jung.algorithms.cluster.KMeansClusterer.NotEnoughClustersException;
27 import edu.uci.ics.jung.algorithms.importance.VoltageRanker;
28 import edu.uci.ics.jung.graph.ArchetypeGraph;
29 import edu.uci.ics.jung.graph.ArchetypeVertex;
30 import edu.uci.ics.jung.graph.Vertex;
31 import edu.uci.ics.jung.graph.decorators.UserDatumNumberVertexValue;
32 import edu.uci.ics.jung.statistics.DiscreteDistribution;
33  
34 /**
35  * <p>Clusters vertices of a <code>Graph</code> based on their ranks as
36  * calculated by <code>VoltageRanker</code>. This algorithm is based on,
37  * but not identical with, the method described in the paper below.
38  * The primary difference is that Wu and Huberman assume a priori that the clusters
39  * are of approximately the same size, and therefore use a more complex
40  * method than k-means (which is used here) for determining cluster
41  * membership based on co-occurrence data.</p>
42  *
43  * <p>The algorithm proceeds as follows:
44  * <ul>
45  * <li/>first, generate a set of candidate clusters as follows:
46  * <ul>
47  * <li/>pick (widely separated) vertex pair, run VoltageRanker
48  * <li/>group the vertices in two clusters according to their voltages
49  * <li/>store resulting candidate clusters
50  * </ul>
51  * <li/>second, generate k-1 clusters as follows:
52  * <ul>
53  * <li/>pick a vertex v as a cluster 'seed'
54  * <br>(Wu/Huberman: most frequent vertex in candidate clusters)
55  * <li/>calculate co-occurrence over all candidate clusters of v with each other
56  * vertex
57  * <li/>separate co-occurrence counts into high/low;
58  * high vertices constitute a cluster
59  * <li/>remove v's vertices from candidate clusters; continue
60  * </ul>
61  * <li/>finally, remaining unassigned vertices are assigned to the kth ("garbage")
62  * cluster.
63  * </ul></p>
64  *
65  * <p><b>NOTE</b>: Depending on how the co-occurrence data splits the data into
66  * clusters, the number of clusters returned by this algorithm may be less than the
67  * number of clusters requested. The number of clusters will never be more than
68  * the number requested, however.</p>
69  *
70  * @author Joshua O'Madadhain
71  * @see <a href="http://www.hpl.hp.com/research/idl/papers/linear/">'Finding communities in linear time: a physics approach', Fang Wu and Bernardo Huberman</a>
72  * @see VoltageRanker
73  * @see KMeansClusterer
74  */
75 public class VoltageClusterer
76 {
77     public final static String VOLTAGE_KEY = "edu.uci.ics.jung.algorithms.cluster.VoltageClusterer.KEY";
78     
79     protected int num_candidates;
80     protected KMeansClusterer kmc;
81     protected UserDatumNumberVertexValue vv;
82     protected VoltageRanker vr;
83     protected RandomEngine rand;
84     
85     /**
86      * Creates an instance of a VoltageCluster with the specified parameters.
87      * These are mostly parameters that are passed directly to VoltageRanker
88      * and KMeansClusterer.
89      *
90      * @param num_candidates the number of candidate clusters to create
91      * @param rank_iterations the number of iterations to run VoltageRanker
92      * @param cluster_iterations the number of iterations to run KMeansClusterer
93      * @param cluster_convergence the convergence value for KMeansClusterer
94      */
95     public VoltageClusterer(int num_candidates, int rank_iterations,
96         double rank_convergence, int cluster_iterations, double cluster_convergence)
970    {
980        if (num_candidates < 1)
990            throw new IllegalArgumentException("must generate >=1 candidates");
100  
1010        this.num_candidates = num_candidates;
1020        this.vv = new UserDatumNumberVertexValue(VOLTAGE_KEY);
1030        this.vr = new VoltageRanker(vv, rank_iterations, rank_convergence);
1040        this.kmc = new KMeansClusterer(cluster_iterations, cluster_convergence);
1050        rand = new DRand();
1060    }
107     
108     protected void setRandomSeed(int random_seed)
109     {
1100        rand = new DRand(random_seed);
1110    }
112     
113     /**
114      * Returns a community (cluster) centered around <code>v</code>.
115      * @param v the vertex whose community we wish to discover
116      */
117     public Collection getCommunity(ArchetypeVertex v)
118     {
1190        return cluster_internal(v.getGraph(), v, 2);
120     }
121  
122     /**
123      * Clusters the vertices of <code>g</code> into
124      * <code>num_clusters</code> clusters, based on their connectivity.
125      * @param g the graph whose vertices are to be clustered
126      * @param num_clusters the number of clusters to identify
127      */
128     public Collection cluster(ArchetypeGraph g, int num_clusters)
129     {
1300        return cluster_internal(g, null, num_clusters);
131     }
132     
133     /**
134      * Clears the voltage decoration values from the vertices of <code>g</code>.
135      */
136     public void clear(ArchetypeGraph g)
137     {
1380        vv.clear(g);
1390    }
140     
141     /**
142      * Does the work of <code>getCommunity</code> and <code>cluster</code>.
143      * @param g the graph whose vertices we're clustering
144      * @param origin the center (if one exists) of the graph to cluster
145      * @param num_clusters
146      */
147     protected Collection cluster_internal(ArchetypeGraph g,
148         ArchetypeVertex origin, int num_clusters)
149     {
150         // generate candidate clusters
151         // repeat the following 'samples' times:
152         // * pick (widely separated) vertex pair, run VoltageRanker
153         // * use k-means to identify 2 communities in ranked graph
154         // * store resulting candidate communities
1550        Set vertices = g.getVertices();
1560        int num_vertices = vertices.size();
1570        ArchetypeVertex[] v = new ArchetypeVertex[num_vertices];
1580        int i = 0;
1590        for (Iterator iter = vertices.iterator(); iter.hasNext(); )
1600            v[i++] = (ArchetypeVertex)iter.next();
161  
1620        LinkedList candidates = new LinkedList();
163         
1640        for (int j = 0; j < num_candidates; j++)
165         {
166             ArchetypeVertex source;
1670            if (origin == null)
1680                source = v[(int)(rand.nextDouble() * num_vertices)];
169             else
1700                source = origin;
1710            ArchetypeVertex target = null;
172             do
173             {
1740                target = v[(int)(rand.nextDouble() * num_vertices)];
175             }
1760            while (source == target);
1770            vr.calculateVoltages((Vertex)source, (Vertex)target);
178             
1790            Map voltage_ranks = new HashMap();
1800            for (Iterator iter = vertices.iterator(); iter.hasNext(); )
181             {
1820                ArchetypeVertex w = (ArchetypeVertex)iter.next();
1830                double[] voltage = {vv.getNumber(w).doubleValue()};
1840                voltage_ranks.put(w, voltage);
185             }
186             
187 // Map clusterMap;
188             Collection clusters;
189             try
190             {
1910                clusters = kmc.cluster(voltage_ranks, 2);
1920                Iterator iter = clusters.iterator();
1930                candidates.add(((Map)iter.next()).keySet());
1940                candidates.add(((Map)iter.next()).keySet());
195 // candidates.addAll(clusters);
196             }
1970            catch (NotEnoughClustersException e)
198             {
199                 // ignore this candidate, continue
2000            }
201         }
202         
203         // repeat the following k-1 times:
204         // * pick a vertex v as a cluster seed
205         // (Wu/Huberman: most frequent vertex in candidates)
206         // * calculate co-occurrence (in candidate clusters)
207         // of this vertex with all others
208         // * use k-means to separate co-occurrence counts into high/low;
209         // high vertices are a cluster
210         // * remove v's vertices from candidate clusters
211         
2120        Collection clusters = new LinkedList();
2130        Collection remaining = new HashSet(g.getVertices());
2140        Object[] seed_candidates = getSeedCandidates(candidates);
2150        int seed_index = 0;
216         
2170        for (int j = 0; j < (num_clusters - 1); j++)
218         {
2190            if (remaining.isEmpty())
2200                break;
221                 
222             Object seed;
2230            if (seed_index == 0 && origin != null)
2240                seed = origin;
225             else
226             {
2270                do { seed = seed_candidates[seed_index++]; }
2280                while (!remaining.contains(seed));
229             }
230             
2310            Map occur_counts = getObjectCounts(candidates, seed);
2320            if (occur_counts.size() < 2)
2330                break;
234             
235             // now that we have the counts, cluster them...
236             try
237             {
2380                Collection high_low = kmc.cluster(occur_counts, 2);
239                 // ...get the cluster with the highest-valued centroid...
2400                Iterator h_iter = high_low.iterator();
2410                Map cluster1 = (Map)h_iter.next();
2420                Map cluster2 = (Map)h_iter.next();
2430                double[] centroid1 = DiscreteDistribution.mean(cluster1.values());
2440                double[] centroid2 = DiscreteDistribution.mean(cluster2.values());
245 // double[] centroid1 = (double[])h_iter.next();
246 // double[] centroid2 = (double[])h_iter.next();
247                 Collection new_cluster;
2480                if (centroid1[0] >= centroid2[0])
2490                    new_cluster = cluster1.keySet();
250                 else
2510                    new_cluster = cluster2.keySet();
252                 
253                 // ...remove the elements of new_cluster from each candidate...
2540                for (Iterator iter = candidates.iterator(); iter.hasNext(); )
255                 {
2560                    Collection cluster = (Collection)iter.next();
2570                    cluster.removeAll(new_cluster);
258                 }
2590                clusters.add(new_cluster);
2600                remaining.removeAll(new_cluster);
261             }
2620            catch (NotEnoughClustersException nece)
263             {
264                 // all remaining vertices are in the same cluster
2650                break;
2660            }
267         }
268         
269         // identify remaining vertices (if any) as a 'garbage' cluster
2700        if (!remaining.isEmpty())
2710            clusters.add(remaining);
272         
2730        return clusters;
274     }
275     
276     protected Object getRandomElement(Collection c)
277     {
2780        return c.toArray()[(int)(rand.nextDouble() * c.size())];
279     }
280     
281     /**
282      * Returns an array of cluster seeds, ranked in decreasing order
283      * of number of appearances in the specified collection of candidate
284      * clusters.
285      * @param candidates
286      */
287     protected Object[] getSeedCandidates(Collection candidates)
288     {
2890        final Map occur_counts = getObjectCounts(candidates, null);
290         
2910        Object[] occurrences = occur_counts.keySet().toArray();
2920        Arrays.sort(occurrences, new Comparator()
293             {
294                 public int compare(Object arg0, Object arg1)
295                 {
296                     double[] count0 = (double[])occur_counts.get(arg0);
297                     double[] count1 = (double[])occur_counts.get(arg1);
298                     if (count0[0] < count1[0])
299                         return -1;
300                     else if (count0[0] > count1[0])
301                         return 1;
302                     else
303                         return 0;
304                 }
305             });
3060        return occurrences;
307     }
308  
309     protected Map getObjectCounts(Collection candidates, Object seed)
310     {
3110        Map occur_counts = new HashMap();
3120        for (Iterator iter = candidates.iterator(); iter.hasNext(); )
313         {
3140            Collection candidate = (Collection)iter.next();
3150            if (seed == null || candidate.contains(seed))
316             {
3170                for (Iterator c_iter = candidate.iterator(); c_iter.hasNext(); )
318                 {
3190                    Object element = c_iter.next();
3200                    double[] count = (double[])occur_counts.get(element);
3210                    if (count == null)
322                     {
3230                        count = new double[1];
3240                        occur_counts.put(element, count);
325                     }
3260                    else count[0]++;
327                 }
328             }
329         }
330         
3310        return occur_counts;
332     }
333 }

this report was generated by version 1.0.5 of jcoverage.
visit www.jcoverage.com for updates.

copyright © 2003, jcoverage ltd. all rights reserved.
Java is a trademark of Sun Microsystems, Inc. in the United States and other countries.