Skip to content

Commit 6290525

Browse files
fix: Exclude /contentstorage/ URLs from Sharepoint Online Connector (#3630)
## Closes #3603

This change excludes `/contentstorage/` URLs from the sync in all API calls to SharePoint. The connector should not attempt to access these URLs: they are created internally by SharePoint for Teams private channels, Loop components, etc. (what they are used for is, to be honest, fairly undocumented), and they have a different permission model that causes 401 errors and makes the connector stop syncing.

## Checklists

#### Pre-Review Checklist
- [x] this PR does NOT contain credentials of any kind, such as API keys or username/passwords (double check `config.yml.example`)
- [x] this PR has a meaningful title
- [x] this PR links to all relevant github issues that it fixes or partially addresses
- [x] this PR has a thorough description
- [x] Tested the changes locally
- [x] For bugfixes: backport safely to all minor branches still receiving patch releases

## Release Note

Fixes an issue where a SharePoint Online sync configured to crawl the entire tenant (by selecting `*` in the site list) might stop with 401 errors when trying to access URLs containing `/contentstorage/`.

---------

Co-authored-by: Artem Shelkovnikov <[email protected]>
1 parent b89e3d0 commit 6290525

File tree

1 file changed

+47
-3
lines changed

1 file changed

+47
-3
lines changed

connectors/sources/sharepoint_online.py

Lines changed: 47 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,19 @@
7575
WILDCARD = "*"
7676
DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"
7777

78+
# Exclude specific SharePoint paths entirely at the connector level (pre sync-rules)
79+
EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/"]


def _is_excluded_sharepoint_url(url: str) -> bool:
    """Return True when *url* contains an excluded SharePoint path segment.

    The check is a case-insensitive substring match against every entry of
    ``EXCLUDED_SHAREPOINT_PATH_SEGMENTS``.

    Args:
        url: The URL to inspect (for example a site's ``webUrl``).

    Returns:
        True if any excluded segment occurs in the URL; False otherwise,
        including when ``url`` is not a string (e.g. ``None``).
    """
    # A missing or null URL can never match. Answer that explicitly instead of
    # relying on a blanket ``except Exception`` to absorb the AttributeError,
    # which would also hide any genuinely unexpected failure.
    if not isinstance(url, str):
        return False

    # Normalise once rather than once per configured segment.
    lowered_url = url.lower()
    return any(
        segment.lower() in lowered_url
        for segment in EXCLUDED_SHAREPOINT_PATH_SEGMENTS
    )
89+
90+
7891
CURSOR_SITE_DRIVE_KEY = "site_drives"
7992

8093
# Microsoft Graph API Delta constants
@@ -784,6 +797,11 @@ async def sites(
784797
if allowed_root_sites == [WILDCARD] or enumerate_all_sites:
785798
self._logger.debug(f"Looking up all sites to fetch: {allowed_root_sites}")
786799
async for site in self._all_sites(sharepoint_host, allowed_root_sites):
800+
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
801+
self._logger.debug(
802+
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
803+
)
804+
continue
787805
yield site
788806
else:
789807
self._logger.debug(f"Looking up individual sites: {allowed_root_sites}")
@@ -793,9 +811,20 @@ async def sites(
793811
async for site in self._fetch_site_and_subsites_by_path(
794812
sharepoint_host, allowed_site
795813
):
814+
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
815+
self._logger.debug(
816+
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
817+
)
818+
continue
796819
yield site
797820
else:
798-
yield await self._fetch_site(sharepoint_host, allowed_site)
821+
site_obj = await self._fetch_site(sharepoint_host, allowed_site)
822+
if _is_excluded_sharepoint_url(site_obj.get("webUrl", "")):
823+
self._logger.debug(
824+
f"Skipping excluded SharePoint site: {site_obj.get('webUrl', site_obj.get('id', 'unknown'))}"
825+
)
826+
continue
827+
yield site_obj
799828

800829
except NotFound:
801830
self._logger.warning(
@@ -852,8 +881,17 @@ async def _scroll_subsites_by_parent_id(self, parent_site_id):
852881
async def _recurse_sites(self, site_with_subsites):
853882
subsites = site_with_subsites.pop("sites", [])
854883
site_with_subsites.pop("sites@odata.context", None) # remove unnecessary field
855-
yield site_with_subsites
856-
if subsites:
884+
885+
is_excluded = _is_excluded_sharepoint_url(site_with_subsites.get("webUrl", ""))
886+
887+
if is_excluded:
888+
self._logger.debug(
889+
f"Skipping excluded SharePoint site: {site_with_subsites.get('webUrl', site_with_subsites.get('id', 'unknown'))}"
890+
)
891+
else:
892+
yield site_with_subsites
893+
894+
if subsites and not is_excluded:
857895
async for site in self._scroll_subsites_by_parent_id(
858896
site_with_subsites["id"]
859897
):
@@ -1113,6 +1151,12 @@ def _validate_sharepoint_rest_url(self, url):
11131151
if "OVERRIDE_URL" in os.environ:
11141152
return
11151153

1154+
# Exclude SharePoint Content Storage endpoints entirely
1155+
# These URLs are internal and should not be crawled by the connector
1156+
if _is_excluded_sharepoint_url(url):
1157+
# Silently return to let callers that explicitly skip excluded URLs proceed
1158+
return
1159+
11161160
# I haven't found a better way to validate tenant name for now.
11171161
actual_tenant_name = self._tenant_name_pattern.findall(url)[0]
11181162

0 commit comments

Comments
 (0)