Skip to content

Commit c508d48

Browse files
tw4l and ikreymer authored
Download archived items as single WACZ file when possible via new download endpoint query parameter (#2850)
Fixes #2648 Replaces #2805 This PR introduces a `preferSingleWACZ` query parameter to the `/all-crawls/<crawl_id>/download` and `/crawls/<crawl_id>/download` endpoints. When set to true, these endpoints will only create multi-WACZs when a crawl has more than one WACZ file, and otherwise will stream the original crawl WACZ. This flag is not enabled by default to prevent introducing breaking changes to the API, but the frontend is updated to use it in all places where it seemed appropriate. A new backend test is also added to account for the change. --------- Co-authored-by: Ilya Kreymer <[email protected]>
1 parent f722d47 commit c508d48

File tree

8 files changed

+123
-24
lines changed

8 files changed

+123
-24
lines changed

backend/btrixcloud/basecrawls.py

Lines changed: 23 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -907,8 +907,15 @@ async def get_all_crawl_search_values(
907907
"firstSeeds": list(first_seeds),
908908
}
909909

910-
async def download_crawl_as_single_wacz(self, crawl_id: str, org: Organization):
911-
"""Download all WACZs in archived item as streaming nested WACZ"""
910+
async def download_crawl_as_single_wacz(
911+
self, crawl_id: str, org: Organization, prefer_single_wacz: bool = False
912+
):
913+
"""Download archived item as a single WACZ file
914+
915+
If prefer_single_wacz is false, always returns a multi-WACZ
916+
If prefer_single_wacz is true and archived item has only one WACZ,
917+
returns that instead
918+
"""
912919
crawl = await self.get_crawl_out(crawl_id, org)
913920

914921
if not crawl.resources:
@@ -921,9 +928,15 @@ async def download_crawl_as_single_wacz(self, crawl_id: str, org: Organization):
921928
if crawl.description:
922929
metadata["description"] = crawl.description
923930

924-
resp = await self.storage_ops.download_streaming_wacz(metadata, crawl.resources)
931+
resp = await self.storage_ops.download_streaming_wacz(
932+
metadata, crawl.resources, prefer_single_wacz=prefer_single_wacz
933+
)
925934

926-
headers = {"Content-Disposition": f'attachment; filename="{crawl_id}.wacz"'}
935+
filename = f"{crawl_id}.wacz"
936+
if len(crawl.resources) == 1 and prefer_single_wacz:
937+
filename = crawl.resources[0].name
938+
939+
headers = {"Content-Disposition": f'attachment; filename="{filename}"'}
927940
return StreamingResponse(
928941
resp, headers=headers, media_type="application/wacz+zip"
929942
)
@@ -1125,9 +1138,13 @@ async def get_crawl_out(
11251138
response_model=bytes,
11261139
)
11271140
async def download_base_crawl_as_single_wacz(
1128-
crawl_id: str, org: Organization = Depends(org_viewer_dep)
1141+
crawl_id: str,
1142+
preferSingleWACZ: bool = False,
1143+
org: Organization = Depends(org_viewer_dep),
11291144
):
1130-
return await ops.download_crawl_as_single_wacz(crawl_id, org)
1145+
return await ops.download_crawl_as_single_wacz(
1146+
crawl_id, org, prefer_single_wacz=preferSingleWACZ
1147+
)
11311148

11321149
@app.patch(
11331150
"/orgs/{oid}/all-crawls/{crawl_id}",

backend/btrixcloud/crawls.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1403,9 +1403,13 @@ async def get_crawl_out(
14031403
"/orgs/{oid}/crawls/{crawl_id}/download", tags=["crawls"], response_model=bytes
14041404
)
14051405
async def download_crawl_as_single_wacz(
1406-
crawl_id: str, org: Organization = Depends(org_viewer_dep)
1406+
crawl_id: str,
1407+
preferSingleWACZ: bool = False,
1408+
org: Organization = Depends(org_viewer_dep),
14071409
):
1408-
return await ops.download_crawl_as_single_wacz(crawl_id, org)
1410+
return await ops.download_crawl_as_single_wacz(
1411+
crawl_id, org, prefer_single_wacz=preferSingleWACZ
1412+
)
14091413

14101414
# QA APIs
14111415
# ---------------------

backend/btrixcloud/storages.py

Lines changed: 21 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -829,9 +829,22 @@ def _sync_get_filestream(self, wacz_url: str, filename: str) -> Iterator[bytes]:
829829
yield from file_stream
830830

831831
def _sync_dl(
832-
self, metadata: dict[str, str], all_files: List[CrawlFileOut]
832+
self,
833+
metadata: dict[str, str],
834+
all_files: List[CrawlFileOut],
835+
prefer_single_wacz: bool = False,
833836
) -> Iterator[bytes]:
834837
"""generate streaming zip as sync"""
838+
839+
def get_file(path: str) -> Iterator[bytes]:
840+
path = self.resolve_internal_access_path(path)
841+
r = requests.get(path, stream=True, timeout=None)
842+
yield from r.iter_content(CHUNK_SIZE)
843+
844+
if len(all_files) == 1 and prefer_single_wacz:
845+
wacz_file = all_files[0]
846+
return get_file(wacz_file.path)
847+
835848
datapackage = {
836849
"profile": "multi-wacz-package",
837850
"resources": [
@@ -851,11 +864,6 @@ def _sync_dl(
851864
def get_datapackage() -> Iterable[bytes]:
852865
yield datapackage_bytes
853866

854-
def get_file(path: str) -> Iterable[bytes]:
855-
path = self.resolve_internal_access_path(path)
856-
r = requests.get(path, stream=True, timeout=None)
857-
yield from r.iter_content(CHUNK_SIZE)
858-
859867
def member_files() -> (
860868
Iterable[tuple[str, datetime, int, Method, Iterable[bytes]]]
861869
):
@@ -884,13 +892,18 @@ def member_files() -> (
884892
return cast(Iterator[bytes], stream_zip(member_files(), chunk_size=CHUNK_SIZE))
885893

886894
async def download_streaming_wacz(
887-
self, metadata: dict[str, str], files: List[CrawlFileOut]
895+
self,
896+
metadata: dict[str, str],
897+
files: List[CrawlFileOut],
898+
prefer_single_wacz: bool = False,
888899
) -> Iterator[bytes]:
889900
"""return an iter for downloading a stream nested wacz file
890901
from list of files"""
891902
loop = asyncio.get_event_loop()
892903

893-
resp = await loop.run_in_executor(None, self._sync_dl, metadata, files)
904+
resp = await loop.run_in_executor(
905+
None, self._sync_dl, metadata, files, prefer_single_wacz
906+
)
894907

895908
return resp
896909

backend/test/test_run_crawl.py

Lines changed: 64 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -422,7 +422,7 @@ def test_download_wacz_crawls(
422422
with ZipFile(fh, "r") as zip_file:
423423
contents = zip_file.namelist()
424424

425-
assert len(contents) >= 2
425+
assert len(contents) == 2
426426
for filename in contents:
427427
assert filename.endswith(".wacz") or filename == "datapackage.json"
428428
assert zip_file.getinfo(filename).compress_type == ZIP_STORED
@@ -437,6 +437,69 @@ def test_download_wacz_crawls(
437437
assert resource["bytes"]
438438

439439

440+
@pytest.mark.parametrize(
441+
"type_path",
442+
[
443+
# crawls endpoint
444+
("crawls"),
445+
# all-crawls endpoint
446+
("all-crawls"),
447+
],
448+
)
449+
def test_download_wacz_crawls_as_single_wacz(
450+
admin_auth_headers, default_org_id, admin_crawl_id, type_path
451+
):
452+
with TemporaryFile() as fh:
453+
with requests.get(
454+
f"{API_PREFIX}/orgs/{default_org_id}/{type_path}/{admin_crawl_id}/download?preferSingleWACZ=true",
455+
headers=admin_auth_headers,
456+
stream=True,
457+
) as r:
458+
assert r.status_code == 200
459+
for chunk in r.iter_content():
460+
fh.write(chunk)
461+
462+
fh.seek(0)
463+
with ZipFile(fh, "r") as zip_file:
464+
contents = zip_file.namelist()
465+
466+
assert len(contents) >= 6
467+
468+
assert "datapackage.json" in contents
469+
assert "datapackage-digest.json" in contents
470+
471+
archives_found = False
472+
indexes_found = False
473+
pages_found = False
474+
logs_found = False
475+
476+
for filename in contents:
477+
print(filename)
478+
if filename.startswith("archive/") and filename.endswith(".warc.gz"):
479+
archives_found = True
480+
if filename.startswith("indexes/"):
481+
indexes_found = True
482+
if filename.startswith("pages/") and filename.endswith(".jsonl"):
483+
pages_found = True
484+
if filename.startswith("logs/") and filename.endswith(".log"):
485+
logs_found = True
486+
487+
if filename == "datapackage.json":
488+
data = zip_file.read(filename).decode("utf-8")
489+
datapackage = json.loads(data)
490+
assert len(datapackage["resources"]) >= 6
491+
for resource in datapackage["resources"]:
492+
assert resource["name"]
493+
assert resource["path"]
494+
assert resource["hash"]
495+
assert resource["bytes"]
496+
497+
assert archives_found
498+
assert indexes_found
499+
assert pages_found
500+
assert logs_found
501+
502+
440503
def test_update_crawl(
441504
admin_auth_headers,
442505
default_org_id,

frontend/src/features/crawl-workflows/workflow-action-menu/workflow-action-menu.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ export class WorkflowActionMenu extends BtrixElement {
220220
return html`
221221
<sl-menu slot="submenu">
222222
<btrix-menu-item-link
223-
href=${`/api/orgs/${this.orgId}/all-crawls/${latestCrawl.id}/download?auth_bearer=${authToken}`}
223+
href=${`/api/orgs/${this.orgId}/all-crawls/${latestCrawl.id}/download?auth_bearer=${authToken}&preferSingleWACZ=true`}
224224
?disabled=${!latestCrawl.fileSize}
225225
download
226226
>

frontend/src/pages/org/archived-item-detail/archived-item-detail.ts

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -340,15 +340,17 @@ export class ArchivedItemDetail extends BtrixElement {
340340
case "files":
341341
sectionContent = this.renderPanel(
342342
html` ${this.renderTitle(this.tabLabels.files)}
343-
<sl-tooltip content=${msg("Download Files as Multi-WACZ")}>
343+
<sl-tooltip
344+
content=${msg("Download all files as a single WACZ file")}
345+
>
344346
<sl-button
345-
href=${`/api/orgs/${this.orgId}/all-crawls/${this.itemId}/download?auth_bearer=${authToken}`}
347+
href=${`/api/orgs/${this.orgId}/all-crawls/${this.itemId}/download?auth_bearer=${authToken}&preferSingleWACZ=true`}
346348
download=${`browsertrix-${this.itemId}.wacz`}
347349
size="small"
348350
variant="primary"
349351
>
350352
<sl-icon slot="prefix" name="cloud-download"></sl-icon>
351-
${msg("Download Files")}
353+
${msg("Download All")}
352354
</sl-button>
353355
</sl-tooltip>`,
354356
this.renderFiles(),
@@ -691,7 +693,7 @@ export class ArchivedItemDetail extends BtrixElement {
691693
`,
692694
)}
693695
<btrix-menu-item-link
694-
href=${`/api/orgs/${this.orgId}/all-crawls/${this.itemId}/download?auth_bearer=${authToken}`}
696+
href=${`/api/orgs/${this.orgId}/all-crawls/${this.itemId}/download?auth_bearer=${authToken}&preferSingleWACZ=true`}
695697
download
696698
>
697699
<sl-icon name="cloud-download" slot="prefix"></sl-icon>

frontend/src/pages/org/archived-items.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -786,7 +786,7 @@ export class CrawlsList extends BtrixElement {
786786
isSuccessfullyFinished(item),
787787
() => html`
788788
<btrix-menu-item-link
789-
href=${`/api/orgs/${this.orgId}/all-crawls/${item.id}/download?auth_bearer=${authToken}`}
789+
href=${`/api/orgs/${this.orgId}/all-crawls/${item.id}/download?auth_bearer=${authToken}&preferSingleWACZ=true`}
790790
download
791791
>
792792
<sl-icon name="cloud-download" slot="prefix"></sl-icon>

frontend/src/pages/org/workflow-detail.ts

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -719,7 +719,7 @@ export class WorkflowDetail extends BtrixElement {
719719
const authToken = this.authState?.headers.Authorization.split(" ")[1];
720720
const disableDownload = this.isRunning;
721721
const disableReplay = !latestCrawl.fileSize;
722-
const replayHref = `/api/orgs/${this.orgId}/all-crawls/${latestCrawlId}/download?auth_bearer=${authToken}`;
722+
const replayHref = `/api/orgs/${this.orgId}/all-crawls/${latestCrawlId}/download?auth_bearer=${authToken}&preferSingleWACZ=true`;
723723
const replayFilename = `browsertrix-${latestCrawlId}.wacz`;
724724

725725
return html`

0 commit comments

Comments (0)