Skip to content

Commit 839aa5b

Browse files
committed
Add Hierarchical Density-Based Spatial Clustering (HDBSCAN) Community Detection
1 parent fd0e7b4 commit 839aa5b

6 files changed

+168
-4
lines changed
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Estimate
2+
3+
CALL gds.hdbscan.write.estimate(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
writeProperty: $dependencies_projection_write_property,
7+
samples: 3
8+
})
9+
YIELD requiredMemory
10+
,nodeCount
11+
,relationshipCount
12+
,bytesMin
13+
,bytesMax
14+
,heapPercentageMin
15+
,heapPercentageMax
16+
,treeView
17+
,mapView
18+
RETURN requiredMemory
19+
,nodeCount
20+
,relationshipCount
21+
,bytesMin
22+
,bytesMax
23+
,heapPercentageMin
24+
,heapPercentageMax
25+
,treeView
26+
//,mapView //doesn't work on Windows with git bash jq version jq-1.7-dirty
Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Statistics
2+
3+
CALL gds.hdbscan.stats(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
samples: 3
7+
})
8+
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
9+
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, preProcessingMillis, computeMillis, postProcessingMillis
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Mutate
2+
3+
CALL gds.hdbscan.mutate(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
mutateProperty: $dependencies_projection_write_property,
7+
samples: 3
8+
})
9+
YIELD nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
10+
RETURN nodeCount, numberOfClusters, numberOfNoisePoints, nodePropertiesWritten, preProcessingMillis, computeMillis, postProcessingMillis, mutateMillis
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - Stream
2+
3+
CALL gds.hdbscan.stream(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
samples: 3
7+
})
8+
YIELD nodeId, label
9+
WITH gds.util.asNode(nodeId) AS member
10+
,label
11+
WITH member
12+
,coalesce(member.fqn, member.fileName, member.name) AS memberName
13+
,label
14+
WITH count(DISTINCT member) AS memberCount
15+
,collect(DISTINCT memberName) AS memberNames
16+
,label
17+
RETURN memberCount
18+
,label
19+
,memberNames
20+
ORDER BY memberCount DESC, label ASC
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
// Community Detection: Hierarchical Density-Based Spatial Clustering (HDBSCAN) - write node property e.g. communityHdbscanLabel
2+
3+
CALL gds.hdbscan.write(
4+
$dependencies_projection + '-cleaned', {
5+
nodeProperty: $dependencies_projection_node_embeddings_property,
6+
writeProperty: $dependencies_projection_write_property,
7+
samples: 3
8+
})
9+
// samples: 3 turned out to be needed here (TODO: document the reason; the original note was left unfinished)
10+
YIELD nodeCount
11+
,numberOfClusters
12+
,numberOfNoisePoints
13+
,preProcessingMillis
14+
,computeMillis
15+
,writeMillis
16+
,postProcessingMillis
17+
,nodePropertiesWritten
18+
RETURN nodeCount
19+
,numberOfClusters
20+
,numberOfNoisePoints
21+
,preProcessingMillis
22+
,computeMillis
23+
,writeMillis
24+
,postProcessingMillis
25+
,nodePropertiesWritten

scripts/reports/CommunityCsv.sh

Lines changed: 78 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,73 @@ detectCommunitiesWithKCoreDecomposition() {
242242
calculateCommunityMetrics "${@}" "${writePropertyName}"
243243
}
244244

245+
# Node Embeddings using Fast Random Projection
246+
#
247+
# Required Parameters:
248+
# - dependencies_projection=...
249+
# Name prefix for the in-memory projection name for dependencies. Example: "package"
250+
# - dependencies_projection_node=...
251+
# Label of the nodes that will be used for the projection. Example: "Package"
252+
# - dependencies_projection_weight_property=...
253+
# Name of the node property that contains the dependency weight. Example: "weight"
254+
# - dependencies_projection_embedding_dimension=...
255+
# Number of dimensions and therefore the size of the resulting array of floating point numbers (this function appends it itself with the value 2)
256+
nodeEmbeddingsWithFastRandomProjectionForHDBSCAN() {
257+
local PROJECTION_CYPHER_DIR="${CYPHER_DIR}/Dependencies_Projection"
258+
local NODE_EMBEDDINGS_CYPHER_DIR="${CYPHER_DIR}/Node_Embeddings"
259+
local mutatePropertyName="dependencies_projection_write_property=embeddingsFastRandomProjection"
260+
local embeddingsDimension="dependencies_projection_embedding_dimension=2"
261+
262+
# Statistics
263+
execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1a_Fast_Random_Projection_Estimate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
264+
execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1b_Fast_Random_Projection_Statistics.cypher" "${@}" ${embeddingsDimension}
265+
266+
# Run the algorithm and write the result into the in-memory projection ("mutate")
267+
execute_cypher "${NODE_EMBEDDINGS_CYPHER_DIR}/Node_Embeddings_1c_Fast_Random_Projection_Mutate.cypher" "${@}" "${mutatePropertyName}" ${embeddingsDimension}
268+
}
269+
270+
# Community Detection using Hierarchical Density-Based Spatial Clustering (HDBSCAN) Algorithm
271+
#
272+
# Required Parameters:
273+
# - dependencies_projection=...
274+
# Name prefix for the in-memory projection name for dependencies. Example: "package"
275+
# - dependencies_projection_node=...
276+
# Label of the nodes that will be used for the projection. Example: "Package"
277+
# - dependencies_projection_weight_property=...
278+
# Name of the node property that contains the dependency weight. Example: "weight"
279+
# - dependencies_projection_node_embeddings_property=...
280+
# Name of the node property that contains node embeddings. Example: "embeddingsFastRandomProjection"
281+
#
282+
# Special Requirements:
283+
# - This algorithm needs a node property with an array of floats to compute clusters.
284+
# One possible way is to use node embeddings for that (like FastRP).
285+
detectCommunitiesWithHDBSCAN() {
286+
local COMMUNITY_DETECTION_CYPHER_DIR="${CYPHER_DIR}/Community_Detection"
287+
local PROJECTION_CYPHER_DIR="${CYPHER_DIR}/Dependencies_Projection"
288+
289+
local writePropertyName="dependencies_projection_write_property=communityHdbscanLabel"
290+
local writeLabelName="dependencies_projection_write_label=HDBSCAN"
291+
292+
# Statistics
293+
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11a_HDBSCAN_Estimate.cypher" "${@}" "${writePropertyName}"
294+
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11b_HDBSCAN_Statistics.cypher" "${@}"
295+
296+
# Run the algorithm and write the result into the in-memory projection ("mutate")
297+
execute_cypher "${COMMUNITY_DETECTION_CYPHER_DIR}/Community_Detection_11c_HDBSCAN_Mutate.cypher" "${@}" "${writePropertyName}"
298+
299+
# Stream to CSV
300+
local nodeLabel
301+
nodeLabel=$( extractQueryParameter "dependencies_projection_node" "${@}")
302+
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_8_Stream_Mutated_Grouped.cypher" "${@}" "${writePropertyName}" > "${FULL_REPORT_DIRECTORY}/${nodeLabel}_Communities_HDBSCAN.csv"
303+
304+
# Update Graph (node properties and labels) using the already mutated property projection
305+
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_9_Write_Mutated.cypher" "${@}" "${writePropertyName}"
306+
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_10_Delete_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"
307+
execute_cypher "${PROJECTION_CYPHER_DIR}/Dependencies_11_Add_Label.cypher" "${@}" "${writePropertyName}" "${writeLabelName}"
308+
309+
calculateCommunityMetrics "${@}" "${writePropertyName}"
310+
}
311+
245312
# Community Detection using the Approximate Maximum k-cut Algorithm
246313
#
247314
# Required Parameters:
@@ -402,6 +469,13 @@ detectCommunities() {
402469
time detectCommunitiesWithKCoreDecomposition "${@}"
403470
time detectCommunitiesWithApproximateMaximumKCut "${@}"
404471
time calculateLocalClusteringCoefficient "${@}"
472+
473+
# TODO Hard-wire the built-in dependencies_projection_node_embeddings_property
474+
nodeEmbeddingsProperty=$( extractQueryParameter "dependencies_projection_node_embeddings_property" "${@}")
475+
if [ -n "${nodeEmbeddingsProperty}" ]; then
476+
time nodeEmbeddingsWithFastRandomProjectionForHDBSCAN "${@}"
477+
time detectCommunitiesWithHDBSCAN "${@}"
478+
fi
405479
compareCommunityDetectionResults "${@}"
406480
listAllResults "${@}"
407481
}
@@ -415,7 +489,7 @@ ARTIFACT_GAMMA="dependencies_leiden_gamma=1.11" # default = 1.00
415489
ARTIFACT_KCUT="dependencies_maxkcut=5" # default = 2
416490

417491
if createUndirectedDependencyProjection "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"; then
418-
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}"
492+
detectCommunities "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}" "${ARTIFACT_GAMMA}" "${ARTIFACT_KCUT}" # "${ARTIFACT_NODE_EMBEDDINGS}"
419493
writeLeidenModularity "${ARTIFACT_PROJECTION}" "${ARTIFACT_NODE}" "${ARTIFACT_WEIGHT}"
420494
fi
421495

@@ -426,9 +500,10 @@ PACKAGE_NODE="dependencies_projection_node=Package"
426500
PACKAGE_WEIGHT="dependencies_projection_weight_property=weight25PercentInterfaces"
427501
PACKAGE_GAMMA="dependencies_leiden_gamma=1.14" # default = 1.00
428502
PACKAGE_KCUT="dependencies_maxkcut=20" # default = 2
503+
PACKAGE_NODE_EMBEDDINGS="dependencies_projection_node_embeddings_property=embeddingsFastRandomProjection" # default = none
429504

430505
if createUndirectedDependencyProjection "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"; then
431-
detectCommunities "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}" "${PACKAGE_GAMMA}" "${PACKAGE_KCUT}"
506+
detectCommunities "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}" "${PACKAGE_GAMMA}" "${PACKAGE_KCUT}" "${PACKAGE_NODE_EMBEDDINGS}"
432507
writeLeidenModularity "${PACKAGE_PROJECTION}" "${PACKAGE_NODE}" "${PACKAGE_WEIGHT}"
433508

434509
# Package Community Detection - Special CSV Queries after update
@@ -444,8 +519,7 @@ TYPE_GAMMA="dependencies_leiden_gamma=5.00" # default = 1.00
444519
TYPE_KCUT="dependencies_maxkcut=100" # default = 2
445520

446521
if createUndirectedJavaTypeDependencyProjection "${TYPE_PROJECTION}"; then
447-
detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}"
448-
522+
detectCommunities "${TYPE_PROJECTION}" "${TYPE_NODE}" "${TYPE_WEIGHT}" "${TYPE_GAMMA}" "${TYPE_KCUT}" "${TYPE_NODE_EMBEDDINGS}"
449523
# Type Community Detection - Special CSV Queries after update
450524
execute_cypher "${CYPHER_DIR}/Community_Detection/Which_type_community_spans_several_artifacts_and_how_are_the_types_distributed.cypher" > "${FULL_REPORT_DIRECTORY}/Type_Communities_Leiden_That_Span_Multiple_Artifacts.csv"
451525
execute_cypher "${CYPHER_DIR}/Community_Detection/Type_communities_with_few_members_in_foreign_packages.cypher" > "${FULL_REPORT_DIRECTORY}/Type_communities_with_few_members_in_foreign_packages.csv"

0 commit comments

Comments
 (0)