@@ -20,14 +20,18 @@ class TDR:
     DEV_LINK = "https://jade.datarepo-dev.broadinstitute.org/api/repository/v1"
     """(str): The base URL for the TDR API."""

-    def __init__(self, request_util: RunRequest, env: str = 'prod'):
+    def __init__(self, request_util: RunRequest, env: str = 'prod', dry_run: bool = False):
         """
         Initialize the TDR class (A class to interact with the Terra Data Repository (TDR) API).

         **Args:**
         - request_util (`ops_utils.request_util.RunRequest`): Utility for making HTTP requests.
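+        - env (str, optional): The TDR environment to target, either `'prod'` or `'dev'`. Defaults to `'prod'`.
+        - dry_run (bool, optional): If `True`, log delete operations instead of performing them. Defaults to `False`.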
"""
30
30
self .request_util = request_util
31
+ # NOTE: dry_run is not fully implemented in this class, only in delete_files_and_snapshots
32
+ self .dry_run = dry_run
31
33
if env .lower () == 'prod' :
32
34
self .tdr_link = self .PROD_LINK
33
35
elif env .lower () == 'dev' :
@@ -180,6 +182,49 @@ def delete_files(
             check_interval=check_interval
         ).run()

+    def _delete_snapshots_for_files(self, dataset_id: str, file_ids: set[str]) -> None:
+        """Delete snapshots that reference any of the provided file IDs."""
+        snapshots_resp = self.get_dataset_snapshots(dataset_id=dataset_id)
+        snapshot_items = snapshots_resp.json().get('items', [])
+        snapshots_to_delete = []
+        logging.info(
+            "Checking %d snapshots for references",
+            len(snapshot_items),
+        )
+        for snap in snapshot_items:
+            snap_id = snap.get('id')
+            if not snap_id:
+                continue
+            snap_files = self.get_files_from_snapshot(snapshot_id=snap_id)
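+            # Collect the file IDs referenced by this snapshot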
+            snap_file_ids = {
+                fd.get('fileId') for fd in snap_files if fd.get('fileId')
+            }
+            # Use set intersection to check for any matching file IDs
+            if snap_file_ids & file_ids:
+                snapshots_to_delete.append(snap_id)
+        if snapshots_to_delete:
+            self.delete_snapshots(snapshot_ids=snapshots_to_delete)
+        else:
+            logging.info("No snapshots reference the provided file ids")
+
+    def _dry_run_msg(self) -> str:
+        return '[Dry run] ' if self.dry_run else ''
+
+    def delete_files_and_snapshots(self, dataset_id: str, file_ids: set[str]) -> None:
+        """Delete files from a dataset by their IDs, handling snapshots."""
+        self._delete_snapshots_for_files(dataset_id=dataset_id, file_ids=file_ids)
+
+        logging.info(
+            f"{self._dry_run_msg()}Submitting delete request for {len(file_ids)} files in "
+            f"dataset {dataset_id}")
+        if not self.dry_run:
+            self.delete_files(
+                file_ids=list(file_ids),
+                dataset_id=dataset_id
+            )
+
     def add_user_to_dataset(self, dataset_id: str, user: str, policy: str) -> requests.Response:
         """
         Add a user to a dataset with a specified policy.
@@ -322,14 +365,16 @@ def delete_snapshots(
         - check_interval (int, optional): The interval in seconds to wait between status checks. Defaults to `10`.
         - verbose (bool, optional): Whether to log detailed information about each job. Defaults to `False`.
         """
-        SubmitAndMonitorMultipleJobs(
-            tdr=self,
-            job_function=self.delete_snapshot,
-            job_args_list=[(snapshot_id,) for snapshot_id in snapshot_ids],
-            batch_size=batch_size,
-            check_interval=check_interval,
-            verbose=verbose
-        ).run()
+        logging.info(f"{self._dry_run_msg()}Deleting {len(snapshot_ids)} snapshots")
+        if not self.dry_run:
+            SubmitAndMonitorMultipleJobs(
+                tdr=self,
+                job_function=self.delete_snapshot,
+                job_args_list=[(snapshot_id,) for snapshot_id in snapshot_ids],
+                batch_size=batch_size,
+                check_interval=check_interval,
+                verbose=verbose
+            ).run()

     def delete_snapshot(self, snapshot_id: str) -> requests.Response:
         """
@@ -937,6 +982,11 @@ def _get_response_from_batched_endpoint(self, uri: str, limit: int = 1000) -> list:
                 break

             metadata.extend(response_json)
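+            # A batch smaller than the page limit means there are no further pages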
+            if len(response_json) < limit:
+                logging.info(f"Retrieved final batch of results, found {len(metadata)} total records")
+                break
+
             # Increment the offset by limit for the next page
             offset += limit
             batch += 1
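
For reference, a minimal sketch of how the new dry-run flow might be exercised (the `TDR` import path and the `RunRequest` setup are assumptions based on the docstrings above, and the IDs are placeholders, not taken from this change):

    from ops_utils.request_util import RunRequest
    from ops_utils.tdr_utils.tdr_api_utils import TDR  # assumed module path

    request_util = RunRequest(...)  # placeholder setup; see the ops_utils docs
    # With dry_run=True, snapshot lookups still run, but deletes are only logged
    tdr = TDR(request_util=request_util, env='dev', dry_run=True)
    tdr.delete_files_and_snapshots(
        dataset_id="00000000-0000-0000-0000-000000000000",  # placeholder dataset UUID
        file_ids={"file-id-1", "file-id-2"},  # placeholder TDR file IDs
    )

Dropping `dry_run=True` performs the real deletions: referencing snapshots first, then the files in batches.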