Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 19 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
## Features

- Converts Confluence pages to Markdown format.
- Uses the Atlassian API to export individual pages, pages including children, and whole spaces.
- Uses the Atlassian API to export individual pages, pages including children, folders with subfolders, and whole spaces.
- Supports various Confluence elements such as headings, paragraphs, lists, tables, and more.
- Retains formatting such as bold, italic, and underline.
- Converts Confluence macros to equivalent Markdown syntax where possible.
Expand Down Expand Up @@ -50,7 +50,7 @@ pip install confluence-markdown-exporter

### 2. Exporting

Run the exporter with the desired Confluence page ID or space key. Execute the console application by typing `confluence-markdown-exporter` and one of the commands `pages`, `pages-with-descendants`, `spaces`, `all-spaces` or `config`. If a command is unclear, you can always add `--help` to get additional information.
Run the exporter with the desired Confluence page ID, folder ID, or space key. Execute the console application by typing `confluence-markdown-exporter` and one of the commands `pages`, `pages-with-descendants`, `folders`, `spaces`, `all-spaces` or `config`. If a command is unclear, you can always add `--help` to get additional information.

> [!TIP]
> Instead of `confluence-markdown-exporter` you can also use the shorthand `cf-export`.
Expand Down Expand Up @@ -91,7 +91,23 @@ Export all Confluence pages of a single Space:
confluence-markdown-exporter spaces <space-key e.g. MYSPACE> <output path e.g. ./output_path/>
```

#### 2.3. Export all Spaces
#### 2.4. Export Folder

Export all Confluence pages within a folder and all its subfolders by folder ID:

```sh
confluence-markdown-exporter folders <folder-id e.g. 3491123> --output-path <output path e.g. ./output_path/>
```

or by URL:

```sh
confluence-markdown-exporter folders <folder-url e.g. https://company.atlassian.net/wiki/spaces/MYSPACE/folders/3491123> --output-path <output path e.g. ./output_path/>
```

This command **recursively exports all pages** from the specified folder and any nested subfolders within it. You can find the folder ID in the Confluence URL when viewing a folder, or from the folder's properties in Confluence.

#### 2.5. Export all Spaces

Export all Confluence pages across all spaces:

Expand Down
182 changes: 182 additions & 0 deletions confluence_markdown_exporter/confluence.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,116 @@ def from_key(cls, space_key: str) -> "Space":
)


class Folder(BaseModel):
    """A Confluence folder and the space it belongs to.

    Instances are built from the folder payload returned by
    ``get_folder_by_id`` (see ``from_json``), either by ID (``from_id``) or
    from a folder URL (``from_url``).
    """

    id: str
    title: str
    space: Space

    @property
    def pages(self) -> list[int]:
        """Return the IDs of all pages in this folder and its subfolders.

        Every access walks the folder tree again through the API, so keep the
        result in a local variable when it is needed more than once.
        """
        return self._get_all_pages()

    def _get_all_pages(self) -> list[int]:
        """Recursively collect all page IDs from this folder and subfolders.

        Subfolders that cannot be loaded or traversed are logged and skipped
        so that one inaccessible subfolder does not abort the whole export.

        Returns:
            Page IDs in the order the API lists them, depth-first.
        """
        page_ids: list[int] = []

        for child in get_folder_children(self.id):
            child_type = child.get("type")
            child_id = child.get("id")
            if not child_id:
                continue

            if child_type == "page":
                page_ids.append(int(child_id))
                continue
            if child_type != "folder":
                # Other child types are not exported.
                continue

            try:
                subfolder = Folder.from_id(child_id)
            except ValueError as e:
                # from_id converts ApiError/HTTPError into ValueError, so that
                # is the exception type a failed subfolder lookup surfaces as.
                logger.warning(f"Could not access subfolder {child_id}: {e}")
                continue

            try:
                page_ids.extend(subfolder.pages)
            except (ApiError, HTTPError) as e:
                logger.warning(f"Could not access subfolder {child_id}: {e}")

        return page_ids

    def export(self) -> None:
        """Export all pages within this folder (including subfolders)."""
        page_ids = self.pages
        if not page_ids:
            logger.warning(f"No pages found in folder '{self.title}' (ID: {self.id})")
        export_pages(page_ids)

    @classmethod
    def from_json(cls, data: JsonResponse) -> "Folder":
        """Create a Folder instance from an API JSON response.

        Args:
            data: Folder payload; the keys ``id``, ``title`` and ``spaceId``
                are read, each falling back to an empty value when missing.

        Returns:
            The folder. When the space cannot be resolved, ``space`` is an
            empty placeholder ``Space`` instead of a fetched one.
        """
        space_key = ""
        if "spaceId" in data:
            space_id = data.get("spaceId", "")
            try:
                # NOTE(review): `spaceId` looks like a numeric space ID while
                # `get_space` presumably expects a space key - confirm that
                # this lookup actually succeeds against a real instance.
                space_data = cast("JsonResponse", confluence.get_space(space_id, expand="homepage"))
                space_key = space_data.get("key", "")
            except (ApiError, HTTPError):
                logger.warning(f"Could not fetch space for folder {data.get('id', '')}")

        return cls(
            id=data.get("id", ""),
            title=data.get("title", ""),
            space=Space.from_key(space_key) if space_key else Space(
                key="", name="", description="", homepage=0
            ),
        )

    @classmethod
    @functools.lru_cache(maxsize=100)
    def from_id(cls, folder_id: str) -> "Folder":
        """Fetch a folder by ID and create a Folder instance.

        Successful results are memoized per folder ID (up to 100 entries).

        Raises:
            ValueError: If the folder cannot be fetched or built; the original
                ``ApiError``/``HTTPError`` is chained as the cause.
        """
        try:
            folder_data = get_folder_by_id(folder_id)
            return cls.from_json(folder_data)
        except (ApiError, HTTPError) as e:
            msg = f"Could not access folder with ID {folder_id}: {e}"
            raise ValueError(msg) from e

    @classmethod
    def from_url(cls, folder_url: str) -> "Folder":
        """Retrieve a Folder object given a Confluence folder URL.

        Supports URL patterns like:
        - https://company.atlassian.net/wiki/spaces/SPACE/folders/123456
        - https://company.atlassian.net/wiki/spaces/SPACE/pages/folders/123456

        If the URL's host is not part of the configured Confluence URL, the
        setting is switched to that host and the module-level client is
        re-created before the folder is fetched.

        Raises:
            ValueError: If no folder ID can be extracted from the URL, or if
                the folder cannot be fetched (see ``from_id``).
        """
        url = urllib.parse.urlparse(folder_url)
        hostname = url.hostname
        if hostname and hostname not in str(settings.auth.confluence.url):
            global confluence  # noqa: PLW0603
            set_setting("auth.confluence.url", f"{url.scheme}://{hostname}/")
            confluence = get_confluence_instance()  # Refresh instance with new URL

        path = url.path.rstrip("/")

        # Ordered from most to least specific; the first pattern that matches
        # anywhere in the path decides the folder ID.
        folder_id_patterns = (
            r"/wiki/spaces/[^/]+/folders/(\d+)",
            r"/wiki/spaces/[^/]+/pages/folders/(\d+)",
            r"/wiki/.+?/folders/(\d+)",
        )
        for pattern in folder_id_patterns:
            if match := re.search(pattern, path):
                return Folder.from_id(match.group(1))

        msg = f"Could not parse folder URL {folder_url}."
        raise ValueError(msg)


class Label(BaseModel):
id: str
name: str
Expand Down Expand Up @@ -966,6 +1076,78 @@ def _get_path_for_href(self, path: Path, style: Literal["absolute", "relative"])
return result


def get_folder_by_id(folder_id: str) -> JsonResponse:
    """Fetch the metadata of a single folder from the ``api/v2/folders`` endpoint.

    Args:
        folder_id: The folder ID.

    Returns:
        The JSON payload returned for the folder.

    Raises:
        ApiNotFoundError: If the client returns an empty (falsy) response.
            Errors raised by the client call itself propagate unchanged.
    """
    folder = confluence.get(f"api/v2/folders/{folder_id}")
    if folder:
        return cast("JsonResponse", folder)

    # An empty response is treated the same as a missing folder.
    msg = f"Folder with ID {folder_id} not found or not accessible"
    raise ApiNotFoundError(msg)


def get_folder_children(folder_id: str) -> list[JsonResponse]:
    """Fetch all children (pages and subfolders) from a folder with pagination.

    Result pages are requested 100 items at a time and followed through the
    cursor carried in the response's ``_links.next`` URL until the listing is
    exhausted.

    Args:
        folder_id: The folder ID.

    Returns:
        List of child objects (pages and folders) with metadata. Empty when
        the folder has no children or the request answers with 404.

    Raises:
        HTTPError: If a request fails with anything other than a 404.
    """
    from urllib.parse import parse_qs
    from urllib.parse import urlparse

    url = f"api/v2/folders/{folder_id}/children"
    all_children: list[JsonResponse] = []
    cursor: str | None = None
    limit = 100

    while True:
        params: dict[str, str | int] = {"limit": limit}
        if cursor:
            params["cursor"] = cursor

        try:
            response = confluence.get(url, params=params)
        except HTTPError as e:
            # `response` may be missing on the exception; treat that as "not a 404".
            if getattr(e.response, "status_code", None) == 404:  # noqa: PLR2004
                logger.warning(
                    f"Folder with ID {folder_id} not found (404) when fetching children."
                )
                break
            raise

        if not response:
            break

        children = response.get("results", [])
        if not children:
            break
        all_children.extend(children)

        # Check for next page
        next_url = response.get("_links", {}).get("next")
        if not next_url:
            break

        # Decode the cursor from the query string. Cutting it out of the raw
        # URL would keep its percent-encoding, which is then encoded a second
        # time when the value is sent back as a request parameter.
        next_cursor = parse_qs(urlparse(next_url).query).get("cursor", [""])[0]
        if not next_cursor or next_cursor == cursor:
            # No cursor, or one that does not advance: stop instead of looping forever.
            break
        cursor = next_cursor

    return all_children


def export_page(page_id: int) -> None:
"""Export a Confluence page to Markdown.

Expand Down
24 changes: 24 additions & 0 deletions confluence_markdown_exporter/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,30 @@ def spaces(
space.export()


@app.command(help="Export all Confluence pages within one or more folders to Markdown.")
def folders(
    folders: Annotated[list[str], typer.Argument(help="Folder ID(s) or URL(s)")],
    output_path: Annotated[
        Path | None,
        typer.Option(
            help="Directory to write exported Markdown files to. Overrides config if set."
        ),
    ] = None,
) -> None:
    # CLI entry point: every argument is either a folder URL (http/https) or a
    # plain folder ID; each one is resolved to a Folder and exported in turn.

    # Imported lazily inside the command - presumably to keep the confluence
    # module's import-time setup out of CLI start-up; confirm before moving it.
    from confluence_markdown_exporter.confluence import Folder

    url_prefixes = ("http://", "https://")

    with measure(f"Export folders {', '.join(folders)}"):
        for folder_ref in folders:
            # Applied before each folder, as in the original flow.
            override_output_path_config(output_path)

            if folder_ref.startswith(url_prefixes):
                resolved = Folder.from_url(folder_ref)
            else:
                resolved = Folder.from_id(folder_ref)
            resolved.export()


@app.command(help="Export all Confluence pages across all spaces to Markdown.")
def all_spaces(
output_path: Annotated[
Expand Down
17 changes: 17 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from pathlib import Path
from typing import Any
from unittest.mock import MagicMock
from unittest.mock import patch

import pytest
from pydantic import AnyHttpUrl
Expand All @@ -17,6 +18,22 @@
from confluence_markdown_exporter.utils.app_data_store import ExportConfig


def pytest_configure(config: pytest.Config) -> None:  # noqa: ARG001
    """Configure pytest by mocking the Confluence instance before import.

    ``confluence.py`` creates a module-level client when it is imported, which
    would require authentication during test collection. The client factory is
    therefore replaced by a mock while the module is imported for the first
    time.

    Args:
        config: The pytest configuration object (unused).
    """
    # The context manager undoes the patch even if the import below fails, so
    # a broken import cannot leave the factory patched for the whole session.
    with patch(
        "confluence_markdown_exporter.api_clients.get_confluence_instance"
    ) as mock_factory:
        mock_factory.return_value = MagicMock()

        # Import the module now with the mock in place
        import confluence_markdown_exporter.confluence  # noqa: F401

    # The patch is gone from here on, so individual tests can mock as needed.


@pytest.fixture
def temp_config_dir() -> Generator[Path, None, None]:
"""Create a temporary directory for test configuration."""
Expand Down
Loading