1919import gnu .trove .list .TIntList ;
2020import gnu .trove .list .array .TIntArrayList ;
2121import java .io .File ;
22+ import java .util .ArrayList ;
2223import java .util .List ;
2324import java .util .Scanner ;
2425import org .apache .log4j .BasicConfigurator ;
5051 *
5152 * @author GAP2
5253 */
53- public class Main {
54+ public class BlockingBasedWorkflow {
5455
55- private final static String MAIN_DIR_CCER_DATASETS = System .getProperty ("user.dir" ) + File .separator +"data" + File .separator + "cleanCleanErDatasets" + File .separator ;
56+ private final static String MAIN_DIR_CCER_DATASETS = System .getProperty ("user.dir" ) + File .separator + "data" + File .separator + "cleanCleanErDatasets" + File .separator ;
5657 private final static String MAIN_DIR_DER_DATASETS = System .getProperty ("user.dir" ) + File .separator + "data" + File .separator + "dirtyErDatasets" + File .separator ;
5758 private final static String [] CCER_ENTITY_FILEPATHS = {"abtProfiles" , "buyProfiles" ,
5859 "dblpProfiles" , "acmProfiles" ,
@@ -72,16 +73,20 @@ public class Main {
7273 private final static String [] CCER_DATASETS = {"Abt-Buy" , "DBLP-ACM" , "DBLP-Scholar" , "Amazon-Google Products" , "IMDB-DBPedia Movies" };
7374 private final static String [] BLOCK_BUILDING_METHODS = {"Extended Q-Grams Blocking" , "Extended Sorted Neighborhood" , "Extended Suffix Arrays Blocking" , "LSH Minhash Blocking" , "LSH Superbit Blocking" , "Q-Grams Blocking" , "Sorted Neighborhood" , "Standard/Token Blocking" , "Suffix Arrays Blocking" };
7475 private final static String [] BLOCK_CLEANING_METHODS = {"Block Filtering" , "Comparison-based Block Purging" , "Size-based Block Purging" };
75- private final static String [] COMPARISON_CLEANING_METHODS = {"Canopy Clustering" , "Cardinality Edge Pruning" , "Cardinality Node Pruning" , "Comparison Propagation" , "Extended Canopy Clustering" , "Reciprocal Cardinality Node Pruning" , "Reciprocal Weighed Node Pruning" , "Weighed Edge Pruning" , "Weighed Node Pruning" };
76+ private final static String [] COMPARISON_CLEANING_METHODS = {"Blast" , " Canopy Clustering" , "Cardinality Edge Pruning" , "Cardinality Node Pruning" , "Comparison Propagation" , "Extended Canopy Clustering" , "Reciprocal Cardinality Node Pruning" , "Reciprocal Weighed Node Pruning" , "Weighed Edge Pruning" , "Weighed Node Pruning" };
7677 private final static String [] ENTITY_MATCHING_METHODS = {"Group Linkage" , "Profile Matcher" };
7778 private final static String [] DIRTY_ER_ENTITY_CLUSTERING_METHODS = {"Center Clustering" , "Connected Components Clustering" , "Cut Clustering" , "Markov Clustering" , "Merge-Center Clustering" , "Ricochet SR Clustering" , "Correlation Clustering" };
7879
79- private static TIntList readMultipleInt (String message , String [] array ) {
80+ private static TIntList readMultipleInt (boolean optional , String message , String [] array ) {
8081 System .out .println ("\n \n " + message );
8182 for (int i = 0 ; i < array .length ; i ++) {
8283 System .out .println ((i + 1 ) + " - " + array [i ]);
8384 }
84- System .out .println ("This is an optional step. You can select none or all options. Choose -1 to terminate this step!" );
85+ if (optional ) {
86+ System .out .println ("This is an optional step. You can select none or all options. Choose -1 to terminate this step!" );
87+ } else {
88+ System .out .println ("Please select one or more of the available options. Choose -1 to terminate this step!" );
89+ }
8590
8691 final TIntList selectedIds = new TIntArrayList ();
8792 while (true ) {
@@ -198,14 +203,14 @@ private static int getDirtyErDataset() {
198203 return readInt (message , DER_DATASETS );
199204 }
200205
201- private static int getBlockBuildingMethod () {
202- String message = "Please choose one of the available Block Building methods:" ;
203- return readInt ( message , BLOCK_BUILDING_METHODS );
206+ private static TIntList getBlockBuildingMethod () {
207+ String message = "Please choose one or more of the available Block Building methods:" ;
208+ return readMultipleInt ( false , message , BLOCK_BUILDING_METHODS );
204209 }
205210
206211 private static TIntList getBlockCleaningMethod () {
207212 String message = "Please choose one, several or none of the available Block Cleaning methods:" ;
208- return readMultipleInt (message , BLOCK_CLEANING_METHODS );
213+ return readMultipleInt (true , message , BLOCK_CLEANING_METHODS );
209214 }
210215
211216 private static int getComparisonCleaningMethod () {
@@ -245,8 +250,7 @@ public static void main(String[] args) {
245250 profilesD2 = eReader2 .getEntityProfiles ();
246251 System .out .println ("Input Entity Profiles D2\t :\t " + profilesD2 .size ());
247252
248- final IGroundTruthReader gtReader = new GtSerializationReader (MAIN_DIR_CCER_DATASETS + CCER_GROUNDTRUTH_FILEPATHS [datasetId - 12
249- ]);
253+ final IGroundTruthReader gtReader = new GtSerializationReader (MAIN_DIR_CCER_DATASETS + CCER_GROUNDTRUTH_FILEPATHS [datasetId - 1 ]);
250254 duplicatePropagation = new BilateralDuplicatePropagation (gtReader .getDuplicatePairs (null ));
251255 System .out .println ("Existing Duplicates\t :\t " + duplicatePropagation .getDuplicates ().size ());
252256 } else {
@@ -266,41 +270,47 @@ public static void main(String[] args) {
266270 final StringBuilder workflowName = new StringBuilder ();
267271
268272 // Block Building
269- int bbMethodId = getBlockBuildingMethod ();
270- double time1 = System . currentTimeMillis ();
273+ final TIntList bbMethodIds = getBlockBuildingMethod ();
274+ List < AbstractBlock > blocks = new ArrayList <> ();
271275
272- final IBlockBuilding blockBuildingMethod = BlockBuildingMethod .getDefaultConfiguration (BlockBuildingMethod .values ()[bbMethodId - 1 ]);
273- List <AbstractBlock > blocks = blockBuildingMethod .getBlocks (profilesD1 , profilesD2 );
276+ double totalTime = 0 ;
277+ for (TIntIterator bbIterator = bbMethodIds .iterator (); bbIterator .hasNext ();) {
278+ double time1 = System .currentTimeMillis ();
279+
280+ final IBlockBuilding blockBuildingMethod = BlockBuildingMethod .getDefaultConfiguration (BlockBuildingMethod .values ()[bbIterator .next () - 1 ]);
281+ blocks .addAll (blockBuildingMethod .getBlocks (profilesD1 , profilesD2 ));
274282
275- double time2 = System .currentTimeMillis ();
276-
277- workflowConf .append (blockBuildingMethod .getMethodConfiguration ());
278- workflowName .append (blockBuildingMethod .getMethodName ());
283+ double time2 = System .currentTimeMillis ();
279284
285+ totalTime += time2 - time1 ;
286+ workflowConf .append (blockBuildingMethod .getMethodConfiguration ()).append ("\n " );
287+ workflowName .append (blockBuildingMethod .getMethodName ()).append ("->" );
288+ }
289+
280290 BlocksPerformance blStats = new BlocksPerformance (blocks , duplicatePropagation );
281291 blStats .setStatistics ();
282- blStats .printStatistics (time2 - time1 , workflowConf .toString (), workflowName .toString ());
292+ blStats .printStatistics (totalTime , workflowConf .toString (), workflowName .toString ());
283293
284294 // Block Cleaning
285295 final TIntList bcMethodIds = getBlockCleaningMethod ();
286296 if (!bcMethodIds .isEmpty ()) {
287297 bcMethodIds .sort ();
288298 bcMethodIds .reverse ();
289- final TIntIterator iterator = bcMethodIds .iterator ();
290- while (iterator .hasNext ()) {
299+ for (TIntIterator bcIterator = bcMethodIds .iterator (); bcIterator .hasNext ();) {
291300 double time3 = System .currentTimeMillis ();
292301
293- final IBlockProcessing blockCleaningMethod = BlockCleaningMethod .getDefaultConfiguration (BlockCleaningMethod .values ()[iterator .next () - 1 ]);
302+ final IBlockProcessing blockCleaningMethod = BlockCleaningMethod .getDefaultConfiguration (BlockCleaningMethod .values ()[bcIterator .next () - 1 ]);
294303 blocks = blockCleaningMethod .refineBlocks (blocks );
295304
296305 double time4 = System .currentTimeMillis ();
297306
298- workflowConf .append ("\n " ).append (blockCleaningMethod .getMethodConfiguration ());
299- workflowName .append ("->" ).append (blockCleaningMethod .getMethodName ());
307+ totalTime += time4 - time3 ;
308+ workflowConf .append (blockCleaningMethod .getMethodConfiguration ()).append ("\n " );
309+ workflowName .append (blockCleaningMethod .getMethodName ()).append ("->" );
300310
301311 blStats = new BlocksPerformance (blocks , duplicatePropagation );
302312 blStats .setStatistics ();
303- blStats .printStatistics (time4 - time3 , workflowConf .toString (), workflowName .toString ());
313+ blStats .printStatistics (totalTime , workflowConf .toString (), workflowName .toString ());
304314 }
305315 }
306316
@@ -314,12 +324,13 @@ public static void main(String[] args) {
314324
315325 double time6 = System .currentTimeMillis ();
316326
317- workflowConf .append ("\n " ).append (comparisonCleaningMethod .getMethodConfiguration ());
318- workflowName .append ("->" ).append (comparisonCleaningMethod .getMethodName ());
327+ totalTime += time6 - time5 ;
328+ workflowConf .append (comparisonCleaningMethod .getMethodConfiguration ()).append ("\n " );
329+ workflowName .append (comparisonCleaningMethod .getMethodName ()).append ("->" );
319330
320331 blStats = new BlocksPerformance (blocks , duplicatePropagation );
321332 blStats .setStatistics ();
322- blStats .printStatistics (time6 - time5 , blockBuildingMethod . getMethodConfiguration (), blockBuildingMethod . getMethodName ());
333+ blStats .printStatistics (totalTime , workflowConf . toString (), workflowName . toString ());
323334 }
324335
325336 // Entity Matching
@@ -331,8 +342,9 @@ public static void main(String[] args) {
331342
332343 double time8 = System .currentTimeMillis ();
333344
334- workflowConf .append ("\n " ).append (entityMatchingMethod .getMethodConfiguration ());
335- workflowName .append ("->" ).append (entityMatchingMethod .getMethodName ());
345+ totalTime += time8 - time7 ;
346+ workflowConf .append (entityMatchingMethod .getMethodConfiguration ()).append ("\n " );
347+ workflowName .append (entityMatchingMethod .getMethodName ()).append ("->" );
336348 System .out .println ("Entity Matching overhead time\t :\t " + (time8 - time7 ));
337349
338350 // Entity Clustering
@@ -351,11 +363,12 @@ public static void main(String[] args) {
351363
352364 long time10 = System .currentTimeMillis ();
353365
354- workflowConf .append ("\n " ).append (entityClusteringMethod .getMethodConfiguration ());
355- workflowName .append ("->" ).append (entityClusteringMethod .getMethodName ());
366+ totalTime += time10 - time9 ;
367+ workflowConf .append (entityClusteringMethod .getMethodConfiguration ());
368+ workflowName .append (entityClusteringMethod .getMethodName ());
356369
357370 ClustersPerformance clp = new ClustersPerformance (entityClusters , duplicatePropagation );
358371 clp .setStatistics ();
359- clp .printStatistics (time10 - time9 , workflowName .toString (), workflowConf .toString ());
372+ clp .printStatistics (totalTime , workflowName .toString (), workflowConf .toString ());
360373 }
361374}
0 commit comments