fix: Exclude /contentstorage/ URLs from Sharepoint Online Connector (#3630)

maxesse · artem-shelkovnikov · web-flow · commit 6290525a05d9 · 2025-10-16T15:54:08.000+02:00
## Closes #3603 The change excludes specific /contentstorage/ URLs from the sync in all API calls to SharePoint. The connector should not attempt to access these URLs: they're created internally by SharePoint for Teams private channels, Loop components, etc. (it's fairly undocumented what they're used for, to be honest), and they have a different permission model that causes 401 errors and makes the connector stop syncing. ## Checklists #### Pre-Review Checklist - [x] this PR does NOT contain credentials of any kind, such as API keys or username/passwords (double check `config.yml.example`) - [x] this PR has a meaningful title - [x] this PR links to all relevant GitHub issues that it fixes or partially addresses - [x] this PR has a thorough description - [x] Tested the changes locally - [x] For bugfixes: backport safely to all minor branches still receiving patch releases ## Release Note Fixes an issue where a SharePoint Online sync configured to crawl the entire tenant by selecting * in the site list might stop with 401 errors when trying to access URLs containing /contentstorage/. --------- Co-authored-by: Artem Shelkovnikov <artem.shelkovnikov@elastic.co>
diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
@@ -75,6 +75,19 @@
 WILDCARD = "*"
 DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"
 
+# Exclude specific SharePoint paths entirely at the connector level (pre sync-rules)
+EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/"]
+
+
+def _is_excluded_sharepoint_url(url: str) -> bool:
+    try:
+        return any(
+            segment in url.lower() for segment in EXCLUDED_SHAREPOINT_PATH_SEGMENTS
+        )
+    except Exception:
+        return False
+
+
 CURSOR_SITE_DRIVE_KEY = "site_drives"
 
 # Microsoft Graph API Delta constants
@@ -784,6 +797,11 @@ async def sites(
         if allowed_root_sites == [WILDCARD] or enumerate_all_sites:
             self._logger.debug(f"Looking up all sites to fetch: {allowed_root_sites}")
             async for site in self._all_sites(sharepoint_host, allowed_root_sites):
+                if _is_excluded_sharepoint_url(site.get("webUrl", "")):
+                    self._logger.debug(
+                        f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
+                    )
+                    continue
                 yield site
         else:
             self._logger.debug(f"Looking up individual sites: {allowed_root_sites}")
@@ -793,9 +811,20 @@ async def sites(
                         async for site in self._fetch_site_and_subsites_by_path(
                             sharepoint_host, allowed_site
                         ):
+                            if _is_excluded_sharepoint_url(site.get("webUrl", "")):
+                                self._logger.debug(
+                                    f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
+                                )
+                                continue
                             yield site
                     else:
-                        yield await self._fetch_site(sharepoint_host, allowed_site)
+                        site_obj = await self._fetch_site(sharepoint_host, allowed_site)
+                        if _is_excluded_sharepoint_url(site_obj.get("webUrl", "")):
+                            self._logger.debug(
+                                f"Skipping excluded SharePoint site: {site_obj.get('webUrl', site_obj.get('id', 'unknown'))}"
+                            )
+                            continue
+                        yield site_obj
 
                 except NotFound:
                     self._logger.warning(
@@ -852,8 +881,17 @@ async def _scroll_subsites_by_parent_id(self, parent_site_id):
     async def _recurse_sites(self, site_with_subsites):
         subsites = site_with_subsites.pop("sites", [])
         site_with_subsites.pop("sites@odata.context", None)  # remove unnecessary field
-        yield site_with_subsites
-        if subsites:
+
+        is_excluded = _is_excluded_sharepoint_url(site_with_subsites.get("webUrl", ""))
+
+        if is_excluded:
+            self._logger.debug(
+                f"Skipping excluded SharePoint site: {site_with_subsites.get('webUrl', site_with_subsites.get('id', 'unknown'))}"
+            )
+        else:
+            yield site_with_subsites
+
+        if subsites and not is_excluded:
             async for site in self._scroll_subsites_by_parent_id(
                 site_with_subsites["id"]
             ):
@@ -1113,6 +1151,12 @@ def _validate_sharepoint_rest_url(self, url):
         if "OVERRIDE_URL" in os.environ:
             return
 
+        # Exclude SharePoint Content Storage endpoints entirely
+        # These URLs are internal and should not be crawled by the connector
+        if _is_excluded_sharepoint_url(url):
+            # Silently return to let callers that explicitly skip excluded URLs proceed
+            return
+
         # I haven't found a better way to validate tenant name for now.
         actual_tenant_name = self._tenant_name_pattern.findall(url)[0]