Skip to content

Commit d96247c

Browse files
committed
Store image data in a pandas DataFrame from dry runs
to make it easier to explore which images will/should be culled
1 parent 682470a commit d96247c

File tree

1 file changed

+44
-16
lines changed

1 file changed

+44
-16
lines changed

scripts/delete-old-images.py

+44-16
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,13 @@
2323
from functools import partial
2424
from pprint import pformat
2525

26-
from dateutil.parser import parse as parse_date
27-
28-
from aiohttp_client_cache import CachedSession, SQLiteBackend
2926
import aiohttp
27+
import pandas as pd
3028
import tqdm
3129
import tqdm.asyncio
3230
import yaml
31+
from aiohttp_client_cache import CachedSession, SQLiteBackend
32+
from dateutil.parser import parse as parse_date
3333

3434
HERE = os.path.dirname(__file__)
3535

@@ -125,7 +125,6 @@ async def list_images_catalog(session, registry_host):
125125
url = None
126126

127127

128-
129128
async def get_manifest(session, image):
130129
"""List the tags for an image
131130
@@ -150,6 +149,7 @@ async def delete_image(session, image, digest, tags, dry_run=False):
150149
if dry_run:
151150
fetch = session.get
152151
verb = "Checking"
152+
return
153153
else:
154154
fetch = session.delete
155155
verb = "Deleting"
@@ -216,7 +216,6 @@ async def bounded(f, *args, **kwargs):
216216
217217
This avoids the two separate queues contending with each other for slots.
218218
"""
219-
220219
async with semaphores[f]:
221220
return await f(*args, **kwargs)
222221

@@ -246,13 +245,12 @@ async def bounded(f, *args, **kwargs):
246245

247246
async with CachedSession(
248247
connector=aiohttp.TCPConnector(limit=2 * concurrency),
249-
cache=SQLiteBackend(expire_after=24 * 3600),
248+
cache=SQLiteBackend(expire_after=72 * 3600),
250249
**auth_kwargs,
251250
) as session:
252251

253252
print("Fetching images")
254253
tag_futures = []
255-
matches = 0
256254
repos_to_keep = 0
257255
repos_to_delete = 0
258256

@@ -262,6 +260,8 @@ def should_delete_repository(image):
262260
ci_string in image for ci_string in CI_STRINGS
263261
):
264262
return False
263+
else:
264+
return True
265265

266266
def should_fetch_repository(image):
267267
if not any(substring in image for substring in R2D_STRINGS):
@@ -294,11 +294,11 @@ def should_fetch_repository(image):
294294
raise RuntimeError(
295295
f"No images matching prefix {prefix}. Would delete all images!"
296296
)
297-
print(f"Not deleting {repos_to_keep} images starting with {prefix}")
297+
print(f"Not deleting {repos_to_keep} repos starting with {prefix}")
298298
if not tag_futures:
299299
print("Nothing to delete")
300300
return
301-
print(f"{len(tag_futures)} images to delete (not counting tags)")
301+
print(f"{len(tag_futures)} repos to consider for deletion (not counting tags)")
302302

303303
delete_futures = set()
304304
done = set()
@@ -325,41 +325,67 @@ def should_delete_tag(image, info):
325325
return False
326326

327327
# check cutoff
328-
image_ms = int(info["timeCreatedMs"])
328+
image_ms = int(info["timeUploadedMs"])
329329
image_datetime = datetime.fromtimestamp(image_ms / 1e3)
330330
# sanity check timestamps
331331
if image_datetime < FIVE_YEARS_AGO or image_datetime > TOMORROW:
332332
raise RuntimeError(
333333
f"Not deleting image with weird date: {image}, {info}, {image_datetime}"
334334
)
335335
if delete_before_ms > image_ms:
336-
# if dry_run:
337-
# print(
338-
# f"\nWould delete {image}:{','.join(info['tag'])} {image_datetime.isoformat()}"
339-
# )
340336
return True
341337
else:
342338
return False
343339

340+
def save_stats():
341+
df = pd.DataFrame(
342+
rows,
343+
columns=[
344+
"image",
345+
"digest",
346+
"tags",
347+
"size",
348+
"date",
349+
],
350+
)
351+
today = datetime.today()
352+
fname = f"registry-{release}-{today.strftime('%Y-%m-%d')}.pkl"
353+
df.to_pickle(fname)
354+
355+
rows = []
344356
try:
345357
for f in tqdm.tqdm(
346358
asyncio.as_completed(tag_futures),
347359
total=len(tag_futures),
348360
position=1,
349-
desc="images retrieved",
361+
desc="repos retrieved",
350362
):
351363
manifest = await f
352364
image = manifest["name"]
353365
delete_whole_repo = should_delete_repository(image)
354366
if delete_whole_repo and len(manifest["manifest"]) > 1:
355367
delete_progress.total += len(manifest["manifest"]) - 1
368+
356369
for digest, info in manifest["manifest"].items():
370+
image_ms = int(info["timeUploadedMs"])
371+
image_datetime = datetime.fromtimestamp(image_ms / 1e3)
372+
nbytes = int(info["imageSizeBytes"])
373+
rows.append(
374+
(
375+
image,
376+
digest,
377+
",".join(info["tag"]),
378+
nbytes,
379+
image_datetime,
380+
)
381+
)
382+
if len(rows) % 100 == 0:
383+
save_stats()
357384
if not should_delete_tag(image, info):
358385
continue
359386
if not delete_whole_repo:
360387
# not counted yet
361388
delete_progress.total += 1
362-
nbytes = int(info["imageSizeBytes"])
363389
delete_byte_progress.total += nbytes
364390
f = asyncio.ensure_future(
365391
bounded(
@@ -389,6 +415,8 @@ def should_delete_tag(image, info):
389415
if delete_futures:
390416
await asyncio.gather(*delete_futures)
391417
finally:
418+
save_stats()
419+
392420
delete_progress.close()
393421
delete_byte_progress.close()
394422
print("\n\n\n\n")

0 commit comments

Comments
 (0)