Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions NOTICE.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1880,7 +1880,7 @@ SOFTWARE.


azure-core
1.35.1
1.36.0
MIT License
Copyright (c) Microsoft Corporation.

Expand Down Expand Up @@ -2606,7 +2606,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND


cachetools
6.2.0
6.2.1
MIT License
The MIT License (MIT)

Expand Down Expand Up @@ -2687,7 +2687,7 @@ documentation is licensed as follows:


charset-normalizer
3.4.3
3.4.4
MIT
MIT License

Expand Down Expand Up @@ -5100,11 +5100,11 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.


idna
3.10
BSD License
3.11
UNKNOWN
BSD 3-Clause License

Copyright (c) 2013-2024, Kim Davies and contributors.
Copyright (c) 2013-2025, Kim Davies and contributors.
All rights reserved.

Redistribution and use in source and binary forms, with or without
Expand Down Expand Up @@ -5977,7 +5977,7 @@ BSD
UNKNOWN

propcache
0.4.0
0.4.1
Apache Software License

Apache License
Expand Down
50 changes: 47 additions & 3 deletions connectors/sources/sharepoint_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,19 @@
WILDCARD = "*"
DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"

# Exclude specific SharePoint paths entirely at the connector level (pre sync-rules)
EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/"]


def _is_excluded_sharepoint_url(url: str) -> bool:
try:
return any(
segment in url.lower() for segment in EXCLUDED_SHAREPOINT_PATH_SEGMENTS
)
except Exception:
return False


CURSOR_SITE_DRIVE_KEY = "site_drives"

# Microsoft Graph API Delta constants
Expand Down Expand Up @@ -784,6 +797,11 @@ async def sites(
if allowed_root_sites == [WILDCARD] or enumerate_all_sites:
self._logger.debug(f"Looking up all sites to fetch: {allowed_root_sites}")
async for site in self._all_sites(sharepoint_host, allowed_root_sites):
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
self._logger.debug(
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
)
continue
yield site
else:
self._logger.debug(f"Looking up individual sites: {allowed_root_sites}")
Expand All @@ -793,9 +811,20 @@ async def sites(
async for site in self._fetch_site_and_subsites_by_path(
sharepoint_host, allowed_site
):
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
self._logger.debug(
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
)
continue
yield site
else:
yield await self._fetch_site(sharepoint_host, allowed_site)
site_obj = await self._fetch_site(sharepoint_host, allowed_site)
if _is_excluded_sharepoint_url(site_obj.get("webUrl", "")):
self._logger.debug(
f"Skipping excluded SharePoint site: {site_obj.get('webUrl', site_obj.get('id', 'unknown'))}"
)
continue
yield site_obj

except NotFound:
self._logger.warning(
Expand Down Expand Up @@ -852,8 +881,17 @@ async def _scroll_subsites_by_parent_id(self, parent_site_id):
async def _recurse_sites(self, site_with_subsites):
subsites = site_with_subsites.pop("sites", [])
site_with_subsites.pop("sites@odata.context", None) # remove unnecessary field
yield site_with_subsites
if subsites:

is_excluded = _is_excluded_sharepoint_url(site_with_subsites.get("webUrl", ""))

if is_excluded:
self._logger.debug(
f"Skipping excluded SharePoint site: {site_with_subsites.get('webUrl', site_with_subsites.get('id', 'unknown'))}"
)
else:
yield site_with_subsites

if subsites and not is_excluded:
async for site in self._scroll_subsites_by_parent_id(
site_with_subsites["id"]
):
Expand Down Expand Up @@ -1113,6 +1151,12 @@ def _validate_sharepoint_rest_url(self, url):
if "OVERRIDE_URL" in os.environ:
return

# Exclude SharePoint Content Storage endpoints entirely
# These URLs are internal and should not be crawled by the connector
if _is_excluded_sharepoint_url(url):
# Silently return to let callers that explicitly skip excluded URLs proceed
return

# I haven't found a better way to validate tenant name for now.
actual_tenant_name = self._tenant_name_pattern.findall(url)[0]

Expand Down