class TDR:
    """Class to interact with the Terra Data Repository (TDR) API."""

-    TDR_LINK = "https://data.terra.bio/api/repository/v1"
+    PROD_LINK = "https://data.terra.bio/api/repository/v1"
+    DEV_LINK = "https://jade.datarepo-dev.broadinstitute.org/api/repository/v1"
    """(str): The base URL for the TDR API."""

-    def __init__(self, request_util: RunRequest):
+    def __init__(self, request_util: RunRequest, env: str = 'prod'):
        """
        Initialize the TDR class (A class to interact with the Terra Data Repository (TDR) API).

        **Args:**
        - request_util (`ops_utils.request_util.RunRequest`): Utility for making HTTP requests.
+        - env (str, optional): The TDR environment to target, either 'prod' or 'dev'. Defaults to 'prod'.
        """
        self.request_util = request_util
+        if env.lower() == 'prod':
+            self.tdr_link = self.PROD_LINK
+        elif env.lower() == 'dev':
+            self.tdr_link = self.DEV_LINK
+        else:
+            raise RuntimeError(f"Unsupported environment: {env}. Must be 'prod' or 'dev'.")
        """@private"""

@staticmethod
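
For context, a minimal sketch of how the new `env` switch is used. The `RunRequest` construction and import paths here are assumptions, not part of this diff:

```python
# Minimal usage sketch of the env switch; RunRequest construction and
# import paths are assumed, not taken from this diff.
from ops_utils.request_util import RunRequest

request_util = RunRequest(...)  # construct per the ops_utils docs; args omitted

tdr_prod = TDR(request_util=request_util)            # defaults to 'prod'
tdr_dev = TDR(request_util=request_util, env="dev")  # dev TDR instance
# Any other env value raises RuntimeError at construction time.
```
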
@@ -89,7 +96,7 @@ def get_dataset_files(
        **Returns:**
        - list[dict]: A list of dictionaries containing the metadata of the files in the dataset.
        """
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}/files"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}/files"
        logging.info(f"Getting all files in dataset {dataset_id}")
        return self._get_response_from_batched_endpoint(uri=uri, limit=limit)
@@ -153,12 +160,12 @@ def get_sas_token(self, snapshot_id: str = "", dataset_id: str = "") -> dict:
        - ValueError: If neither `snapshot_id` nor `dataset_id` is provided.
        """
        if snapshot_id:
-            uri = f"{self.TDR_LINK}/snapshots/{snapshot_id}?include=ACCESS_INFORMATION"
+            uri = f"{self.tdr_link}/snapshots/{snapshot_id}?include=ACCESS_INFORMATION"
            response = self.request_util.run_request(uri=uri, method=GET)
            snapshot_info = json.loads(response.text)
            sas_token = snapshot_info["accessInformation"]["parquet"]["sasToken"]
        elif dataset_id:
-            uri = f"{self.TDR_LINK}/datasets/{dataset_id}?include=ACCESS_INFORMATION"
+            uri = f"{self.tdr_link}/datasets/{dataset_id}?include=ACCESS_INFORMATION"
            response = self.request_util.run_request(uri=uri, method=GET)
            snapshot_info = json.loads(response.text)
            sas_token = snapshot_info["accessInformation"]["parquet"]["sasToken"]
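
A sketch of how `get_sas_token` might be called; the ID is a placeholder, and exactly one of `snapshot_id`/`dataset_id` should be supplied:

```python
# Placeholder snapshot ID; pass dataset_id instead for a dataset-scoped token.
token_info = tdr.get_sas_token(snapshot_id="11111111-2222-3333-4444-555555555555")
```
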
@@ -182,7 +189,7 @@ def delete_file(self, file_id: str, dataset_id: str) -> requests.Response:
        **Returns:**
        - requests.Response: The response from the request.
        """
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}/files/{file_id}"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}/files/{file_id}"
        logging.info(f"Submitting delete job for file {file_id}")
        return self.request_util.run_request(uri=uri, method=DELETE)
@@ -226,7 +233,7 @@ def add_user_to_dataset(self, dataset_id: str, user: str, policy: str) -> reques
        - ValueError: If the policy is not valid.
        """
        self._check_policy(policy)
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}/policies/{policy}/members"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}/policies/{policy}/members"
        member_dict = {"email": user}
        logging.info(f"Adding user {user} to dataset {dataset_id} with policy {policy}")
        return self.request_util.run_request(
@@ -253,7 +260,7 @@ def remove_user_from_dataset(self, dataset_id: str, user: str, policy: str) -> r
        - ValueError: If the policy is not valid.
        """
        self._check_policy(policy)
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}/policies/{policy}/members/{user}"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}/policies/{policy}/members/{user}"
        logging.info(f"Removing user {user} from dataset {dataset_id} with policy {policy}")
        return self.request_util.run_request(uri=uri, method=DELETE)
@@ -264,7 +271,7 @@ def delete_dataset(self, dataset_id: str) -> None:
        **Args:**
        - dataset_id (str): The ID of the dataset to be deleted.
        """
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}"
        logging.info(f"Deleting dataset {dataset_id}")
        response = self.request_util.run_request(uri=uri, method=DELETE)
        job_id = response.json()['id']
@@ -308,7 +315,7 @@ def get_snapshot_info(
            include_string = '&include='.join(info_to_include)
        else:
            include_string = ""
-        uri = f"{self.TDR_LINK}/snapshots/{snapshot_id}?include={include_string}"
+        uri = f"{self.tdr_link}/snapshots/{snapshot_id}?include={include_string}"
        response = self.request_util.run_request(
            uri=uri,
            method=GET,
@@ -356,7 +363,7 @@ def delete_snapshot(self, snapshot_id: str) -> requests.Response:
        **Returns:**
        - requests.Response: The response from the request.
        """
-        uri = f"{self.TDR_LINK}/snapshots/{snapshot_id}"
+        uri = f"{self.tdr_link}/snapshots/{snapshot_id}"
        logging.info(f"Deleting snapshot {snapshot_id}")
        return self.request_util.run_request(uri=uri, method=DELETE)
@@ -383,7 +390,7 @@ def _yield_existing_datasets(
        log_message = f"Searching for all datasets in batches of {batch_size}"
        logging.info(log_message)
        while True:
-            uri = f"{self.TDR_LINK}/datasets?offset={offset}&limit={batch_size}&sort=created_date&direction={direction}{filter_str}"  # noqa: E501
+            uri = f"{self.tdr_link}/datasets?offset={offset}&limit={batch_size}&sort=created_date&direction={direction}{filter_str}"  # noqa: E501
            response = self.request_util.run_request(uri=uri, method=GET)
            datasets = response.json()["items"]
            if not datasets:
@@ -459,7 +466,7 @@ def get_dataset_info(self, dataset_id: str, info_to_include: Optional[list[str]]
            include_string = '&include='.join(info_to_include)
        else:
            include_string = ""
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}?include={include_string}"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}?include={include_string}"
        return self.request_util.run_request(uri=uri, method=GET)

    def get_table_schema_info(
@@ -497,7 +504,7 @@ def get_job_result(self, job_id: str, expect_failure: bool = False) -> requests.
        **Returns:**
        - requests.Response: The response from the request.
        """
-        uri = f"{self.TDR_LINK}/jobs/{job_id}/result"
+        uri = f"{self.tdr_link}/jobs/{job_id}/result"
        # If job is expected to fail, accept any return code
        acceptable_return_code = list(range(100, 600)) if expect_failure else []
        return self.request_util.run_request(uri=uri, method=GET, accept_return_codes=acceptable_return_code)
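
The `expect_failure` flag widens the set of acceptable status codes so a failed job's result can still be inspected. A sketch (the job ID is a placeholder):

```python
job_id = "some-job-id"  # placeholder
# Fetch the result of a job that is expected to have failed, without
# run_request raising on the 4xx/5xx status code.
result = tdr.get_job_result(job_id=job_id, expect_failure=True)
print(result.status_code, result.text)  # inspect the failure payload
```
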
@@ -513,7 +520,7 @@ def ingest_to_dataset(self, dataset_id: str, data: dict) -> requests.Response:
        **Returns:**
        - requests.Response: The response from the request.
        """
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}/ingest"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}/ingest"
        logging.info(
            "If recently added TDR SA to source bucket/dataset/workspace and you receive a 400/403 error, " +
            "it can sometimes take up to 12/24 hours for permissions to propagate. Try rerunning the script later.")
@@ -543,7 +550,7 @@ def file_ingest_to_dataset(
        **Returns:**
        - dict: A dictionary containing the response from the ingest operation job monitoring.
        """
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}/files/bulk/array"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}/files/bulk/array"
        data = {
            "profileId": profile_id,
            "loadTag": f"{load_tag}",
@@ -601,7 +608,7 @@ def _yield_dataset_metrics(self, dataset_id: str, target_table_name: str, query_
            "limit": query_limit,
            "sort": "datarepo_row_id"
        }
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}/data/{target_table_name}"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}/data/{target_table_name}"
        while True:
            batch_number = int((search_request["offset"] / query_limit)) + 1  # type: ignore[operator]
            response = self.request_util.run_request(
@@ -645,7 +652,7 @@ def get_job_status(self, job_id: str) -> requests.Response:
        **Returns:**
        - requests.Response: The response from the request.
        """
-        uri = f"{self.TDR_LINK}/jobs/{job_id}"
+        uri = f"{self.tdr_link}/jobs/{job_id}"
        return self.request_util.run_request(uri=uri, method=GET)

    def get_dataset_file_uuids_from_metadata(self, dataset_id: str) -> list[str]:
@@ -707,7 +714,7 @@ def soft_delete_entries(
            logging.info(f"No records found to soft delete in table {table_name}")
            return None
        logging.info(f"Soft deleting {len(datarepo_row_ids)} records from table {table_name}")
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}/deletes"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}/deletes"
        payload = {
            "deleteType": "soft",
            "specType": "jsonArray",
@@ -766,6 +773,7 @@ def get_or_create_dataset(
            billing_profile: str,
            schema: dict,
            description: str,
+            relationships: Optional[list[dict]] = None,
            delete_existing: bool = False,
            continue_if_exists: bool = False,
            additional_properties_dict: Optional[dict] = None
@@ -778,6 +786,8 @@ def get_or_create_dataset(
        - billing_profile (str): The billing profile ID.
        - schema (dict): The schema of the dataset.
        - description (str): The description of the dataset.
+        - relationships (Optional[list[dict]], optional): A list of relationships to add to the dataset schema.
+          Defaults to None.
        - additional_properties_dict (Optional[dict], optional): Additional properties
          for the dataset. Defaults to None.
        - delete_existing (bool, optional): Whether to delete the existing dataset if found.
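
A sketch of the shape a `relationships` entry might take, following the TDR schema relationship model; the field names below are an assumption based on the TDR API, not part of this diff:

```python
# Hypothetical relationship linking a sample table to a subject table; field
# names follow the TDR schema relationship model and should be verified
# against the TDR API docs.
relationships = [
    {
        "name": "sample_to_subject",
        "from": {"table": "sample", "column": "subject_id"},
        "to": {"table": "subject", "column": "subject_id"},
    }
]
```
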
@@ -857,7 +867,7 @@ def create_dataset(  # type: ignore[return]
            CreateDatasetSchema(**dataset_properties)  # type: ignore[arg-type]
        except ValidationError as e:
            raise ValueError(f"Schema validation error: {e}")
-        uri = f"{self.TDR_LINK}/datasets"
+        uri = f"{self.tdr_link}/datasets"
        logging.info(f"Creating dataset {dataset_name} under billing profile {profile_id}")
        response = self.request_util.run_request(
            method=POST,
@@ -895,7 +905,7 @@ def update_dataset_schema(  # type: ignore[return]
        **Raises:**
        - ValueError: If the schema validation fails.
        """
-        uri = f"{self.TDR_LINK}/datasets/{dataset_id}/updateSchema"
+        uri = f"{self.tdr_link}/datasets/{dataset_id}/updateSchema"
        request_body: dict = {"description": f"{update_note}", "changes": {}}
        if tables_to_add:
            request_body["changes"]["addTables"] = tables_to_add
@@ -968,7 +978,7 @@ def get_files_from_snapshot(self, snapshot_id: str, limit: int = 1000) -> list[d
        **Returns:**
        - list[dict]: A list of dictionaries containing the metadata of the files in the snapshot.
        """
-        uri = f"{self.TDR_LINK}/snapshots/{snapshot_id}/files"
+        uri = f"{self.tdr_link}/snapshots/{snapshot_id}/files"
        return self._get_response_from_batched_endpoint(uri=uri, limit=limit)

    def get_dataset_snapshots(self, dataset_id: str) -> requests.Response:
@@ -981,12 +991,66 @@ def get_dataset_snapshots(self, dataset_id: str) -> requests.Response:
        **Returns:**
        - requests.Response: The response from the request.
        """
-        uri = f"{self.TDR_LINK}/snapshots?datasetIds={dataset_id}"
+        uri = f"{self.tdr_link}/snapshots?datasetIds={dataset_id}"
        return self.request_util.run_request(
            uri=uri,
            method=GET
        )

+    def create_snapshot(
+            self,
+            snapshot_name: str,
+            description: str,
+            dataset_name: str,
+            snapshot_mode: str,  # byFullView is entire dataset
+            profile_id: str,
+            stewards: Optional[list[str]] = None,
+            readers: Optional[list[str]] = None,
+            consent_code: Optional[str] = None,
+            duos_id: Optional[str] = None,
+            data_access_control_groups: Optional[list[str]] = None,
+    ) -> None:
+        """
+        Create a snapshot in TDR and wait for the creation job to complete.
+
+        **Args:**
+        - snapshot_name (str): The name of the snapshot to create.
+        - description (str): The description of the snapshot.
+        - dataset_name (str): The name of the source dataset.
+        - snapshot_mode (str): The snapshot mode, e.g. `byFullView` for the entire dataset.
+        - profile_id (str): The billing profile ID.
+        - stewards (Optional[list[str]], optional): Emails to add to the steward policy. Defaults to None.
+        - readers (Optional[list[str]], optional): Emails to add to the reader policy. Defaults to None.
+        - consent_code (Optional[str], optional): The consent code for the snapshot. Defaults to None.
+        - duos_id (Optional[str], optional): The DUOS ID for the snapshot. Defaults to None.
+        - data_access_control_groups (Optional[list[str]], optional): Data access control groups. Defaults to None.
+        """
+        uri = f"{self.tdr_link}/snapshots"
+        payload = {
+            "name": snapshot_name,
+            "description": description,
+            "contents": [
+                {
+                    "datasetName": dataset_name,
+                    "mode": snapshot_mode,
+                }
+            ],
+            # stewards/readers default to None; fall back to empty lists here
+            # so the signature avoids mutable default arguments
+            "policies": {
+                "stewards": stewards or [],
+                "readers": readers or [],
+            },
+            "profileId": profile_id,
+            "globalFileIds": True,
+        }
+        if consent_code:
+            payload["consentCode"] = consent_code
+        if duos_id:
+            payload["duosId"] = duos_id
+        if data_access_control_groups:
+            payload["dataAccessControlGroups"] = data_access_control_groups
+        logging.info(f"Creating snapshot {snapshot_name} in dataset {dataset_name}")
+        response = self.request_util.run_request(
+            uri=uri,
+            method=POST,
+            content_type="application/json",
+            data=json.dumps(payload)
+        )
+        job_id = response.json()["id"]
+        job_results = MonitorTDRJob(tdr=self, job_id=job_id, check_interval=30, return_json=True).run()
+        snapshot_id = job_results["id"]  # type: ignore[index]
+        logging.info(f"Successfully created snapshot {snapshot_name} - {snapshot_id}")
+

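
A usage sketch for the new `create_snapshot` method; all names and IDs below are placeholders:

```python
# All names and IDs are placeholders.
tdr.create_snapshot(
    snapshot_name="my_dataset_snapshot_20240101",
    description="Full-view snapshot of my_dataset",
    dataset_name="my_dataset",
    snapshot_mode="byFullView",  # snapshot the entire dataset
    profile_id="00000000-0000-0000-0000-000000000000",
    stewards=["steward@firecloud.org"],
    readers=["reader@firecloud.org"],
)
# Blocks while MonitorTDRJob polls the creation job, then logs the new
# snapshot ID; the method itself returns None.
```
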
class FilterOutSampleIdsAlreadyInDataset:
    """Class to filter ingest metrics to remove sample IDs that already exist in the dataset."""