Skip to content

Commit ddcd11b

Browse files
committed
Store image data in pandas from dry runs
for easier poking around in what will/should be culled.
checkpoint checkpoint checkpoint checkpoint
1 parent 682470a commit ddcd11b

File tree

1 file changed

+39
-10
lines changed

1 file changed

+39
-10
lines changed

scripts/delete-old-images.py

Lines changed: 39 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727

2828
from aiohttp_client_cache import CachedSession, SQLiteBackend
2929
import aiohttp
30+
import pandas as pd
3031
import tqdm
3132
import tqdm.asyncio
3233
import yaml
@@ -150,6 +151,7 @@ async def delete_image(session, image, digest, tags, dry_run=False):
150151
if dry_run:
151152
fetch = session.get
152153
verb = "Checking"
154+
return
153155
else:
154156
fetch = session.delete
155157
verb = "Deleting"
@@ -246,7 +248,7 @@ async def bounded(f, *args, **kwargs):
246248

247249
async with CachedSession(
248250
connector=aiohttp.TCPConnector(limit=2 * concurrency),
249-
cache=SQLiteBackend(expire_after=24 * 3600),
251+
cache=SQLiteBackend(expire_after=72 * 3600),
250252
**auth_kwargs,
251253
) as session:
252254

@@ -262,6 +264,8 @@ def should_delete_repository(image):
262264
ci_string in image for ci_string in CI_STRINGS
263265
):
264266
return False
267+
else:
268+
return True
265269

266270
def should_fetch_repository(image):
267271
if not any(substring in image for substring in R2D_STRINGS):
@@ -294,11 +298,11 @@ def should_fetch_repository(image):
294298
raise RuntimeError(
295299
f"No images matching prefix {prefix}. Would delete all images!"
296300
)
297-
print(f"Not deleting {repos_to_keep} images starting with {prefix}")
301+
print(f"Not deleting {repos_to_keep} repos starting with {prefix}")
298302
if not tag_futures:
299303
print("Nothing to delete")
300304
return
301-
print(f"{len(tag_futures)} images to delete (not counting tags)")
305+
print(f"{len(tag_futures)} repos to consider for deletion (not counting tags)")
302306

303307
delete_futures = set()
304308
done = set()
@@ -325,41 +329,64 @@ def should_delete_tag(image, info):
325329
return False
326330

327331
# check cutoff
328-
image_ms = int(info["timeCreatedMs"])
332+
image_ms = int(info["timeUploadedMs"])
329333
image_datetime = datetime.fromtimestamp(image_ms / 1e3)
330334
# sanity check timestamps
331335
if image_datetime < FIVE_YEARS_AGO or image_datetime > TOMORROW:
332336
raise RuntimeError(
333337
f"Not deleting image with weird date: {image}, {info}, {image_datetime}"
334338
)
335339
if delete_before_ms > image_ms:
336-
# if dry_run:
337-
# print(
338-
# f"\nWould delete {image}:{','.join(info['tag'])} {image_datetime.isoformat()}"
339-
# )
340340
return True
341341
else:
342342
return False
343343

344+
# Snapshot the image rows collected so far into a pandas DataFrame and pickle
# it to the relative path "registry-<release>-<YYYY-MM-DD>.pkl".
#
# Takes no arguments: `rows` and `release` are read from the enclosing scope.
# Each entry of `rows` is an (image, digest, tags, size, date) tuple, matching
# the column names below.
# NOTE(review): `release` is not defined anywhere in this diff; presumably it
# is set earlier in the enclosing function -- confirm.
#
# The file name depends only on `release` and today's date, so repeated calls
# on the same day write to the same path.
def save_stats():
345+
df = pd.DataFrame(rows, columns=[
346+
"image",
347+
"digest",
348+
"tags",
349+
"size",
350+
"date",
351+
])
352+
today = datetime.today()
353+
fname = f"registry-{release}-{today.strftime('%Y-%m-%d')}.pkl"
354+
df.to_pickle(fname)
355+
356+
rows = []
344357
try:
345358
for f in tqdm.tqdm(
346359
asyncio.as_completed(tag_futures),
347360
total=len(tag_futures),
348361
position=1,
349-
desc="images retrieved",
362+
desc="repos retrieved",
350363
):
351364
manifest = await f
352365
image = manifest["name"]
353366
delete_whole_repo = should_delete_repository(image)
354367
if delete_whole_repo and len(manifest["manifest"]) > 1:
355368
delete_progress.total += len(manifest["manifest"]) - 1
369+
356370
for digest, info in manifest["manifest"].items():
371+
image_ms = int(info["timeUploadedMs"])
372+
image_datetime = datetime.fromtimestamp(image_ms / 1e3)
373+
nbytes = int(info["imageSizeBytes"])
374+
rows.append(
375+
(
376+
image,
377+
digest,
378+
",".join(info["tag"]),
379+
nbytes,
380+
image_datetime,
381+
)
382+
)
383+
if len(rows) % 100 == 0:
384+
save_stats()
357385
if not should_delete_tag(image, info):
358386
continue
359387
if not delete_whole_repo:
360388
# not counted yet
361389
delete_progress.total += 1
362-
nbytes = int(info["imageSizeBytes"])
363390
delete_byte_progress.total += nbytes
364391
f = asyncio.ensure_future(
365392
bounded(
@@ -389,6 +416,8 @@ def should_delete_tag(image, info):
389416
if delete_futures:
390417
await asyncio.gather(*delete_futures)
391418
finally:
419+
save_stats()
420+
392421
delete_progress.close()
393422
delete_byte_progress.close()
394423
print("\n\n\n\n")

0 commit comments

Comments
 (0)