23
23
from functools import partial
24
24
from pprint import pformat
25
25
26
- from dateutil .parser import parse as parse_date
27
-
28
- from aiohttp_client_cache import CachedSession , SQLiteBackend
29
26
import aiohttp
27
+ import pandas as pd
30
28
import tqdm
31
29
import tqdm .asyncio
32
30
import yaml
31
+ from aiohttp_client_cache import CachedSession , SQLiteBackend
32
+ from dateutil .parser import parse as parse_date
33
33
34
34
HERE = os .path .dirname (__file__ )
35
35
@@ -125,7 +125,6 @@ async def list_images_catalog(session, registry_host):
125
125
url = None
126
126
127
127
128
-
129
128
async def get_manifest (session , image ):
130
129
"""List the tags for an image
131
130
@@ -150,6 +149,7 @@ async def delete_image(session, image, digest, tags, dry_run=False):
150
149
if dry_run :
151
150
fetch = session .get
152
151
verb = "Checking"
152
+ return
153
153
else :
154
154
fetch = session .delete
155
155
verb = "Deleting"
@@ -216,7 +216,6 @@ async def bounded(f, *args, **kwargs):
216
216
217
217
This avoids the two separate queues contending with each other for slots.
218
218
"""
219
-
220
219
async with semaphores [f ]:
221
220
return await f (* args , ** kwargs )
222
221
@@ -246,13 +245,12 @@ async def bounded(f, *args, **kwargs):
246
245
247
246
async with CachedSession (
248
247
connector = aiohttp .TCPConnector (limit = 2 * concurrency ),
249
- cache = SQLiteBackend (expire_after = 24 * 3600 ),
248
+ cache = SQLiteBackend (expire_after = 72 * 3600 ),
250
249
** auth_kwargs ,
251
250
) as session :
252
251
253
252
print ("Fetching images" )
254
253
tag_futures = []
255
- matches = 0
256
254
repos_to_keep = 0
257
255
repos_to_delete = 0
258
256
@@ -262,6 +260,8 @@ def should_delete_repository(image):
262
260
ci_string in image for ci_string in CI_STRINGS
263
261
):
264
262
return False
263
+ else :
264
+ return True
265
265
266
266
def should_fetch_repository (image ):
267
267
if not any (substring in image for substring in R2D_STRINGS ):
@@ -294,11 +294,11 @@ def should_fetch_repository(image):
294
294
raise RuntimeError (
295
295
f"No images matching prefix { prefix } . Would delete all images!"
296
296
)
297
- print (f"Not deleting { repos_to_keep } images starting with { prefix } " )
297
+ print (f"Not deleting { repos_to_keep } repos starting with { prefix } " )
298
298
if not tag_futures :
299
299
print ("Nothing to delete" )
300
300
return
301
- print (f"{ len (tag_futures )} images to delete (not counting tags)" )
301
+ print (f"{ len (tag_futures )} repos to consider for deletion (not counting tags)" )
302
302
303
303
delete_futures = set ()
304
304
done = set ()
@@ -325,41 +325,67 @@ def should_delete_tag(image, info):
325
325
return False
326
326
327
327
# check cutoff
328
- image_ms = int (info ["timeCreatedMs " ])
328
+ image_ms = int (info ["timeUploadedMs " ])
329
329
image_datetime = datetime .fromtimestamp (image_ms / 1e3 )
330
330
# sanity check timestamps
331
331
if image_datetime < FIVE_YEARS_AGO or image_datetime > TOMORROW :
332
332
raise RuntimeError (
333
333
f"Not deleting image with weird date: { image } , { info } , { image_datetime } "
334
334
)
335
335
if delete_before_ms > image_ms :
336
- # if dry_run:
337
- # print(
338
- # f"\nWould delete {image}:{','.join(info['tag'])} {image_datetime.isoformat()}"
339
- # )
340
336
return True
341
337
else :
342
338
return False
343
339
340
def save_stats():
    """Write the image rows collected so far to a pickle file.

    Takes no arguments: ``rows`` and ``release`` are read from the
    surrounding scope. The rows are loaded into a DataFrame with the columns
    image, digest, tags, size and date. The output file name is built from
    ``release`` and today's date, so calls made on the same day target the
    same file name.
    """
    column_names = ["image", "digest", "tags", "size", "date"]
    stats = pd.DataFrame(rows, columns=column_names)
    today = datetime.today()
    fname = f"registry-{ release } -{ today .strftime ('%Y-%m-%d')} .pkl"
    stats.to_pickle(fname)
354
+
355
+ rows = []
344
356
try :
345
357
for f in tqdm .tqdm (
346
358
asyncio .as_completed (tag_futures ),
347
359
total = len (tag_futures ),
348
360
position = 1 ,
349
- desc = "images retrieved" ,
361
+ desc = "repos retrieved" ,
350
362
):
351
363
manifest = await f
352
364
image = manifest ["name" ]
353
365
delete_whole_repo = should_delete_repository (image )
354
366
if delete_whole_repo and len (manifest ["manifest" ]) > 1 :
355
367
delete_progress .total += len (manifest ["manifest" ]) - 1
368
+
356
369
for digest , info in manifest ["manifest" ].items ():
370
+ image_ms = int (info ["timeUploadedMs" ])
371
+ image_datetime = datetime .fromtimestamp (image_ms / 1e3 )
372
+ nbytes = int (info ["imageSizeBytes" ])
373
+ rows .append (
374
+ (
375
+ image ,
376
+ digest ,
377
+ "," .join (info ["tag" ]),
378
+ nbytes ,
379
+ image_datetime ,
380
+ )
381
+ )
382
+ if len (rows ) % 100 == 0 :
383
+ save_stats ()
357
384
if not should_delete_tag (image , info ):
358
385
continue
359
386
if not delete_whole_repo :
360
387
# not counted yet
361
388
delete_progress .total += 1
362
- nbytes = int (info ["imageSizeBytes" ])
363
389
delete_byte_progress .total += nbytes
364
390
f = asyncio .ensure_future (
365
391
bounded (
@@ -389,6 +415,8 @@ def should_delete_tag(image, info):
389
415
if delete_futures :
390
416
await asyncio .gather (* delete_futures )
391
417
finally :
418
+ save_stats ()
419
+
392
420
delete_progress .close ()
393
421
delete_byte_progress .close ()
394
422
print ("\n \n \n \n " )
0 commit comments