|
9 | 9 | from Diffraction.single_crystal.base_sx import BaseSX |
10 | 10 | import time |
11 | 11 | from enum import Enum |
12 | | -from sklearn.cluster import KMeans, HDBSCAN |
13 | | -from sklearn.preprocessing import StandardScaler, MinMaxScaler |
| 12 | +from sklearn.cluster import HDBSCAN |
14 | 13 | from sklearn.metrics import silhouette_score |
15 | 14 |
|
16 | 15 | class Clustering(Enum): |
17 | 16 | QLab = 1 |
18 | 17 | HDBSCAN = 2 |
19 | | - KMeans = 3 |
20 | 18 |
|
21 | 19 |
|
22 | 20 | class BraggDetectCNN: |
@@ -47,94 +45,74 @@ def __init__(self, model_weights_path, batch_size=64, workers=0, iou_threshold=0 |
47 | 45 | self.iou_threshold = iou_threshold |
48 | 46 |
|
49 | 47 |
|
50 | | - def find_bragg_peaks(self, workspace, output_ws_name="CNN_Peaks", conf_threshold=0.0, **kwargs): |
| 48 | + def find_bragg_peaks(self, workspace, output_ws_name="CNN_Peaks", conf_threshold=0.0, clustering=Clustering.QLab.name, **kwargs): |
51 | 49 | """ |
52 | 50 | Find Bragg peaks using the pre-trained FasterRCNN model and create a peaks workspace |
53 | 51 | :param workspace: Workspace name or Workspace object from WISH, e.g. "WISH0042730" |
54 | 52 | :param output_ws_name: Name of the peaks workspace |
55 | 53 | :param conf_threshold: Confidence threshold to filter peaks inferred from RCNN |
56 | | - :param kwargs: variable keyword params for clustering. default is {"name": "Qlab", "q_tol": 0.05} |
57 | | - Ex: {"name": "HDBSCAN", "keep_ignored_labels": True} |
| 54 | + :param clustering: Name of the clustering method. Default is QLab; allowed values are the Clustering enum names ("QLab", "HDBSCAN") |
| 55 | + :param kwargs: Variable keyword parameters forwarded to the selected clustering method |
58 | 56 | """ |
59 | | - clustering_params = {"name": "QLab", "q_tol": 0.05 } |
60 | | - clustering_params.update(kwargs) |
61 | | - |
62 | 57 | start_time = time.time() |
63 | 58 | data_set, predicted_indices = self._do_cnn_inferencing(workspace) |
64 | 59 |
|
65 | 60 | filtered_indices = predicted_indices[predicted_indices[:, -1] > conf_threshold] |
66 | 61 |
|
67 | 62 | #Do Clustering |
68 | | - print(f"Starting peak clustering with { clustering_params['name'] } method..") |
69 | | - clustered_peaks = self._do_peak_clustering(filtered_indices, clustering_params) |
70 | | - print(f"Number of peaks after clustering is={len(clustered_peaks)}") |
71 | | - |
| 63 | + print(f"Starting peak clustering with {clustering} method..") |
| 64 | + clustered_peaks = self._do_peak_clustering(filtered_indices, clustering, **kwargs) |
72 | 65 | cluster_indices_rounded = np.round(clustered_peaks[:, :3]).astype(int) |
73 | 66 | peaksws = createPeaksWorkspaceFromIndices(data_set.get_workspace(), output_ws_name, cluster_indices_rounded, data_set.get_ws_as_3d_array()) |
74 | 67 | for ipk, pk in enumerate(peaksws): |
75 | 68 | pk.setIntensity(clustered_peaks[ipk, -1]) |
76 | 69 |
|
77 | | - if clustering_params["name"] == Clustering.QLab.name: |
| 70 | + if clustering == Clustering.QLab.name: |
78 | 71 | #Filter peaks by qlab |
79 | | - BaseSX.remove_duplicate_peaks_by_qlab(peaksws, clustering_params["q_tol"]) |
| 72 | + clustering_params = {"q_tol": 0.05 } |
| 73 | + clustering_params.update(kwargs) |
| 74 | + BaseSX.remove_duplicate_peaks_by_qlab(peaksws, **clustering_params) |
| 75 | + |
| 76 | + print(f"Number of peaks after clustering is = {len(peaksws)}") |
80 | 77 |
|
81 | 78 | data_set.delete_rebunched_ws() |
82 | | - print(f"Bragg peaks finding from FasterRCNN model is completed in {time.time()-start_time} seconds!") |
| 79 | + print(f"Bragg peaks finding from FasterRCNN model is completed in {time.time()-start_time:.2f} seconds!") |
83 | 80 |
|
84 | 81 |
|
85 | | - def _do_peak_clustering(self, detected_peaks, params): |
86 | | - print(f"Number of peaks before clustering={len(detected_peaks)}") |
87 | | - if params["name"] == Clustering.HDBSCAN.name: |
88 | | - return self._do_hdbscan_clustering(detected_peaks, params) |
89 | | - elif params["name"] == Clustering.KMeans.name: |
90 | | - return self._do_kmeans_clustering(detected_peaks) |
| 82 | + def _do_peak_clustering(self, detected_peaks, clustering, **kwargs): |
| 83 | + print(f"Number of peaks before clustering = {len(detected_peaks)}") |
| 84 | + if clustering == Clustering.HDBSCAN.name: |
| 85 | + return self._do_hdbscan_clustering(detected_peaks, **kwargs) |
91 | 86 | else: |
92 | 87 | return detected_peaks |
93 | 88 |
|
94 | 89 |
|
95 | | - def _do_hdbscan_clustering(self, peakdata, params): |
| 90 | + def _do_hdbscan_clustering(self, peakdata, keep_ignored_labels=True, **kwargs): |
96 | 91 | data = np.delete(peakdata, [3,4], axis=1) |
97 | | - |
98 | | - hdbscan = HDBSCAN(min_cluster_size=2, |
99 | | - min_samples=2, |
100 | | - store_centers="medoid", |
101 | | - algorithm="auto", |
102 | | - cluster_selection_method="eom", |
103 | | - metric="euclidean") |
| 92 | + if ("keep_ignored_labels" in kwargs): |
| 93 | + keep_ignored_labels = kwargs.pop("keep_ignored_labels") |
| 94 | + |
| 95 | + hdbscan_params = {"min_cluster_size": 2, |
| 96 | + "min_samples": 2, |
| 97 | + "store_centers" : "medoid", |
| 98 | + "algorithm": "auto", |
| 99 | + "cluster_selection_method": "eom", |
| 100 | + "metric": "euclidean" |
| 101 | + } |
| 102 | + hdbscan_params.update(kwargs) |
| 103 | + hdbscan = HDBSCAN(**hdbscan_params) |
104 | 104 | hdbscan.fit(data) |
105 | 105 | print(f"Silhouette score of the clusters={silhouette_score(data, hdbscan.labels_)}") |
106 | 106 |
|
107 | | - if ("keep_ignored_labels" in params) and params["keep_ignored_labels"]: |
108 | | - selected_peaks = np.concatenate((hdbscan.medoids_, data[np.where(hdbscan.labels_==-1)]), axis=0) |
| 107 | + if keep_ignored_labels: |
| 108 | + selected_peaks = np.concatenate((hdbscan.medoids_, data[np.where(hdbscan.labels_==-1)]), axis=0) |
109 | 109 | else: |
110 | 110 | selected_peaks = hdbscan.medoids_ |
111 | 111 | confidence = [] |
112 | 112 | for peak in selected_peaks: |
113 | 113 | confidence.append(peakdata[np.where((data == peak).all(axis=1))[0].item(), -1]) |
114 | 114 | return np.column_stack((selected_peaks, confidence)) |
115 | | - |
116 | | - |
117 | | - def _do_kmeans_clustering(self, peakdata): |
118 | | - stdScaler = StandardScaler() |
119 | | - peakdata[:, 3] = stdScaler.fit_transform(peakdata[:, 3].reshape(-1,1)).flatten() |
120 | | - minmaxScaler = MinMaxScaler() |
121 | | - peakdata[:, 4] = minmaxScaler.fit_transform(peakdata[:, 4].reshape(-1, 1)).flatten() |
122 | | - |
123 | | - WCSS = [] |
124 | | - cluster_range = range(1, len(peakdata), 2) |
125 | | - for i in cluster_range: |
126 | | - model = KMeans(n_clusters = i, init = 'k-means++') |
127 | | - model.fit(peakdata) |
128 | | - WCSS.append(model.inertia_) |
129 | | - |
130 | | - first_derivative = np.diff(WCSS, n=1) |
131 | | - elbow_point = np.argmax(first_derivative) + 1 |
132 | | - print(f"Selected elbow point={elbow_point} for KMeans clustering") |
133 | | - finalmodel = KMeans(n_clusters = elbow_point, init = "k-means++", max_iter = 500, n_init = 10, random_state = 0) |
134 | | - finalmodel.fit_predict(peakdata) |
135 | | - print(f"Silhouette score of the clusters={silhouette_score(peakdata, finalmodel.labels_)}") |
136 | | - return finalmodel.cluster_centers_ |
137 | | - |
| 115 | + |
138 | 116 |
|
139 | 117 | def _do_cnn_inferencing(self, workspace): |
140 | 118 | data_set = WISHWorkspaceDataSet(workspace) |
|
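For reference, a minimal usage sketch of the changed find_bragg_peaks API follows. The import path, weights path, and the specific threshold and override values are illustrative assumptions; only the parameter names (clustering, q_tol, keep_ignored_labels, and the HDBSCAN keyword overrides) come from the code in this diff.

# Minimal usage sketch (paths and values are placeholders)
from Diffraction.single_crystal.bragg_detect_cnn import BraggDetectCNN, Clustering  # assumed import path

cnn = BraggDetectCNN(model_weights_path="/path/to/weights.pt", batch_size=64)

# Default QLab clustering; q_tol is forwarded to BaseSX.remove_duplicate_peaks_by_qlab
cnn.find_bragg_peaks("WISH0042730", output_ws_name="CNN_Peaks", conf_threshold=0.5, q_tol=0.05)

# HDBSCAN clustering; remaining kwargs override the defaults set in _do_hdbscan_clustering
cnn.find_bragg_peaks(
    "WISH0042730",
    output_ws_name="CNN_Peaks_hdbscan",
    conf_threshold=0.5,
    clustering=Clustering.HDBSCAN.name,
    keep_ignored_labels=True,
    min_cluster_size=3,  # overrides the default of 2 passed to sklearn's HDBSCAN
)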