Skip to content

Commit e2e4046

Browse files
committed
Extended CLI with support for multiple block building methods.
1 parent fb124bb commit e2e4046

File tree

2 files changed: 47 additions and 35 deletions.

jedai-core/src/main/java/org/scify/jedai/utilities/enumerations/BlockBuildingMethod.java

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@
3535
* @author G.A.P. II
3636
*/
3737
public enum BlockBuildingMethod {
38-
3938
EXTENDED_Q_GRAMS_BLOCKING,
4039
EXTENDED_SORTED_NEIGHBORHOOD,
4140
EXTENDED_SUFFIX_ARRAYS,

jedai-core/src/main/java/org/scify/jedai/workflowbuilder/Main.java renamed to jedai-core/src/main/java/org/scify/jedai/workflowbuilder/BlockingBasedWorkflow.java

Lines changed: 47 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
import gnu.trove.list.TIntList;
2020
import gnu.trove.list.array.TIntArrayList;
2121
import java.io.File;
22+
import java.util.ArrayList;
2223
import java.util.List;
2324
import java.util.Scanner;
2425
import org.apache.log4j.BasicConfigurator;
@@ -50,9 +51,9 @@
5051
*
5152
* @author GAP2
5253
*/
53-
public class Main {
54+
public class BlockingBasedWorkflow {
5455

55-
private final static String MAIN_DIR_CCER_DATASETS = System.getProperty("user.dir") + File.separator +"data" + File.separator + "cleanCleanErDatasets" + File.separator;
56+
private final static String MAIN_DIR_CCER_DATASETS = System.getProperty("user.dir") + File.separator + "data" + File.separator + "cleanCleanErDatasets" + File.separator;
5657
private final static String MAIN_DIR_DER_DATASETS = System.getProperty("user.dir") + File.separator + "data" + File.separator + "dirtyErDatasets" + File.separator;
5758
private final static String[] CCER_ENTITY_FILEPATHS = {"abtProfiles", "buyProfiles",
5859
"dblpProfiles", "acmProfiles",
@@ -72,16 +73,20 @@ public class Main {
7273
private final static String[] CCER_DATASETS = {"Abt-Buy", "DBLP-ACM", "DBLP-Scholar", "Amazon-Google Products", "IMDB-DBPedia Movies"};
7374
private final static String[] BLOCK_BUILDING_METHODS = {"Extended Q-Grams Blocking", "Extended Sorted Neighborhood", "Extended Suffix Arrays Blocking", "LSH Minhash Blocking", "LSH Superbit Blocking", "Q-Grams Blocking", "Sorted Neighborhood", "Standard/Token Blocking", "Suffix Arrays Blocking"};
7475
private final static String[] BLOCK_CLEANING_METHODS = {"Block Filtering", "Comparison-based Block Purging", "Size-based Block Purging"};
75-
private final static String[] COMPARISON_CLEANING_METHODS = {"Canopy Clustering", "Cardinality Edge Pruning", "Cardinality Node Pruning", "Comparison Propagation", "Extended Canopy Clustering", "Reciprocal Cardinality Node Pruning", "Reciprocal Weighed Node Pruning", "Weighed Edge Pruning", "Weighed Node Pruning"};
76+
private final static String[] COMPARISON_CLEANING_METHODS = {"Blast", "Canopy Clustering", "Cardinality Edge Pruning", "Cardinality Node Pruning", "Comparison Propagation", "Extended Canopy Clustering", "Reciprocal Cardinality Node Pruning", "Reciprocal Weighed Node Pruning", "Weighed Edge Pruning", "Weighed Node Pruning"};
7677
private final static String[] ENTITY_MATCHING_METHODS = {"Group Linkage", "Profile Matcher"};
7778
private final static String[] DIRTY_ER_ENTITY_CLUSTERING_METHODS = {"Center Clustering", "Connected Components Clustering", "Cut Clustering", "Markov Clustering", "Merge-Center Clustering", "Ricochet SR Clustering", "Correlation Clustering"};
7879

79-
private static TIntList readMultipleInt(String message, String[] array) {
80+
private static TIntList readMultipleInt(boolean optional, String message, String[] array) {
8081
System.out.println("\n\n" + message);
8182
for (int i = 0; i < array.length; i++) {
8283
System.out.println((i + 1) + " - " + array[i]);
8384
}
84-
System.out.println("This is an optional step. You can select none or all options. Choose -1 to terminate this step!");
85+
if (optional) {
86+
System.out.println("This is an optional step. You can select none or all options. Choose -1 to terminate this step!");
87+
} else {
88+
System.out.println("Please select one or more of the available options. Choose -1 to terminate this step!");
89+
}
8590

8691
final TIntList selectedIds = new TIntArrayList();
8792
while (true) {
@@ -198,14 +203,14 @@ private static int getDirtyErDataset() {
198203
return readInt(message, DER_DATASETS);
199204
}
200205

201-
private static int getBlockBuildingMethod() {
202-
String message = "Please choose one of the available Block Building methods:";
203-
return readInt(message, BLOCK_BUILDING_METHODS);
206+
private static TIntList getBlockBuildingMethod() {
207+
String message = "Please choose one or more of the available Block Building methods:";
208+
return readMultipleInt(false, message, BLOCK_BUILDING_METHODS);
204209
}
205210

206211
private static TIntList getBlockCleaningMethod() {
207212
String message = "Please choose one, several or none of the available Block Cleaning methods:";
208-
return readMultipleInt(message, BLOCK_CLEANING_METHODS);
213+
return readMultipleInt(true, message, BLOCK_CLEANING_METHODS);
209214
}
210215

211216
private static int getComparisonCleaningMethod() {
@@ -245,8 +250,7 @@ public static void main(String[] args) {
245250
profilesD2 = eReader2.getEntityProfiles();
246251
System.out.println("Input Entity Profiles D2\t:\t" + profilesD2.size());
247252

248-
final IGroundTruthReader gtReader = new GtSerializationReader(MAIN_DIR_CCER_DATASETS + CCER_GROUNDTRUTH_FILEPATHS[datasetId - 12
249-
]);
253+
final IGroundTruthReader gtReader = new GtSerializationReader(MAIN_DIR_CCER_DATASETS + CCER_GROUNDTRUTH_FILEPATHS[datasetId - 1]);
250254
duplicatePropagation = new BilateralDuplicatePropagation(gtReader.getDuplicatePairs(null));
251255
System.out.println("Existing Duplicates\t:\t" + duplicatePropagation.getDuplicates().size());
252256
} else {
@@ -266,41 +270,47 @@ public static void main(String[] args) {
266270
final StringBuilder workflowName = new StringBuilder();
267271

268272
// Block Building
269-
int bbMethodId = getBlockBuildingMethod();
270-
double time1 = System.currentTimeMillis();
273+
final TIntList bbMethodIds = getBlockBuildingMethod();
274+
List<AbstractBlock> blocks = new ArrayList<>();
271275

272-
final IBlockBuilding blockBuildingMethod = BlockBuildingMethod.getDefaultConfiguration(BlockBuildingMethod.values()[bbMethodId - 1]);
273-
List<AbstractBlock> blocks = blockBuildingMethod.getBlocks(profilesD1, profilesD2);
276+
double totalTime = 0;
277+
for (TIntIterator bbIterator = bbMethodIds.iterator(); bbIterator.hasNext();) {
278+
double time1 = System.currentTimeMillis();
279+
280+
final IBlockBuilding blockBuildingMethod = BlockBuildingMethod.getDefaultConfiguration(BlockBuildingMethod.values()[bbIterator.next() - 1]);
281+
blocks.addAll(blockBuildingMethod.getBlocks(profilesD1, profilesD2));
274282

275-
double time2 = System.currentTimeMillis();
276-
277-
workflowConf.append(blockBuildingMethod.getMethodConfiguration());
278-
workflowName.append(blockBuildingMethod.getMethodName());
283+
double time2 = System.currentTimeMillis();
279284

285+
totalTime += time2 - time1;
286+
workflowConf.append(blockBuildingMethod.getMethodConfiguration()).append("\n");
287+
workflowName.append(blockBuildingMethod.getMethodName()).append("->");
288+
}
289+
280290
BlocksPerformance blStats = new BlocksPerformance(blocks, duplicatePropagation);
281291
blStats.setStatistics();
282-
blStats.printStatistics(time2 - time1, workflowConf.toString(), workflowName.toString());
292+
blStats.printStatistics(totalTime, workflowConf.toString(), workflowName.toString());
283293

284294
// Block Cleaning
285295
final TIntList bcMethodIds = getBlockCleaningMethod();
286296
if (!bcMethodIds.isEmpty()) {
287297
bcMethodIds.sort();
288298
bcMethodIds.reverse();
289-
final TIntIterator iterator = bcMethodIds.iterator();
290-
while (iterator.hasNext()) {
299+
for (TIntIterator bcIterator = bcMethodIds.iterator(); bcIterator.hasNext();) {
291300
double time3 = System.currentTimeMillis();
292301

293-
final IBlockProcessing blockCleaningMethod = BlockCleaningMethod.getDefaultConfiguration(BlockCleaningMethod.values()[iterator.next() - 1]);
302+
final IBlockProcessing blockCleaningMethod = BlockCleaningMethod.getDefaultConfiguration(BlockCleaningMethod.values()[bcIterator.next() - 1]);
294303
blocks = blockCleaningMethod.refineBlocks(blocks);
295304

296305
double time4 = System.currentTimeMillis();
297306

298-
workflowConf.append("\n").append(blockCleaningMethod.getMethodConfiguration());
299-
workflowName.append("->").append(blockCleaningMethod.getMethodName());
307+
totalTime += time4- time3;
308+
workflowConf.append(blockCleaningMethod.getMethodConfiguration()).append("\n");
309+
workflowName.append(blockCleaningMethod.getMethodName()).append("->");
300310

301311
blStats = new BlocksPerformance(blocks, duplicatePropagation);
302312
blStats.setStatistics();
303-
blStats.printStatistics(time4 - time3, workflowConf.toString(), workflowName.toString());
313+
blStats.printStatistics(totalTime, workflowConf.toString(), workflowName.toString());
304314
}
305315
}
306316

@@ -314,12 +324,13 @@ public static void main(String[] args) {
314324

315325
double time6 = System.currentTimeMillis();
316326

317-
workflowConf.append("\n").append(comparisonCleaningMethod.getMethodConfiguration());
318-
workflowName.append("->").append(comparisonCleaningMethod.getMethodName());
327+
totalTime += time6 - time5;
328+
workflowConf.append(comparisonCleaningMethod.getMethodConfiguration()).append("\n");
329+
workflowName.append(comparisonCleaningMethod.getMethodName()).append("->");
319330

320331
blStats = new BlocksPerformance(blocks, duplicatePropagation);
321332
blStats.setStatistics();
322-
blStats.printStatistics(time6 - time5, blockBuildingMethod.getMethodConfiguration(), blockBuildingMethod.getMethodName());
333+
blStats.printStatistics(totalTime, workflowConf.toString(), workflowName.toString());
323334
}
324335

325336
// Entity Matching
@@ -331,8 +342,9 @@ public static void main(String[] args) {
331342

332343
double time8 = System.currentTimeMillis();
333344

334-
workflowConf.append("\n").append(entityMatchingMethod.getMethodConfiguration());
335-
workflowName.append("->").append(entityMatchingMethod.getMethodName());
345+
totalTime += time8- time7;
346+
workflowConf.append(entityMatchingMethod.getMethodConfiguration()).append("\n");
347+
workflowName.append(entityMatchingMethod.getMethodName()).append("->");
336348
System.out.println("Entity Matching overhead time\t:\t" + (time8 - time7));
337349

338350
// Entity Clustering
@@ -351,11 +363,12 @@ public static void main(String[] args) {
351363

352364
long time10 = System.currentTimeMillis();
353365

354-
workflowConf.append("\n").append(entityClusteringMethod.getMethodConfiguration());
355-
workflowName.append("->").append(entityClusteringMethod.getMethodName());
366+
totalTime += time10 - time9;
367+
workflowConf.append(entityClusteringMethod.getMethodConfiguration());
368+
workflowName.append(entityClusteringMethod.getMethodName());
356369

357370
ClustersPerformance clp = new ClustersPerformance(entityClusters, duplicatePropagation);
358371
clp.setStatistics();
359-
clp.printStatistics(time10 - time9, workflowName.toString(), workflowConf.toString());
372+
clp.printStatistics(totalTime, workflowName.toString(), workflowConf.toString());
360373
}
361374
}

0 commit comments

Comments
 (0)