50 changes: 47 additions & 3 deletions connectors/sources/sharepoint_online.py
@@ -75,6 +75,19 @@
WILDCARD = "*"
DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"

# Exclude specific SharePoint paths entirely at the connector level (applied before sync rules)
EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/"]


def _is_excluded_sharepoint_url(url: str) -> bool:
    try:
        return any(
            segment in url.lower() for segment in EXCLUDED_SHAREPOINT_PATH_SEGMENTS
        )
    except Exception:
        # Defensive: treat non-string values (e.g. None) as not excluded
        return False


CURSOR_SITE_DRIVE_KEY = "site_drives"

# Microsoft Graph API Delta constants
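
A quick sanity check of the helper's matching semantics (a hedged sketch; the URLs below are illustrative, not taken from this change):

# Illustrative only: matching is substring-based and case-insensitive.
assert _is_excluded_sharepoint_url("https://tenant.sharepoint.com/contentstorage/CSP_xyz")
assert _is_excluded_sharepoint_url("https://tenant.sharepoint.com/ContentStorage/CSP_xyz")
assert not _is_excluded_sharepoint_url("https://tenant.sharepoint.com/sites/Marketing")
assert not _is_excluded_sharepoint_url("")  # the default used by site.get("webUrl", "")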
@@ -784,6 +797,11 @@ async def sites(
if allowed_root_sites == [WILDCARD] or enumerate_all_sites:
self._logger.debug(f"Looking up all sites to fetch: {allowed_root_sites}")
async for site in self._all_sites(sharepoint_host, allowed_root_sites):
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
self._logger.debug(
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
)
continue
yield site
else:
self._logger.debug(f"Looking up individual sites: {allowed_root_sites}")
@@ -793,9 +811,20 @@
async for site in self._fetch_site_and_subsites_by_path(
sharepoint_host, allowed_site
):
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
self._logger.debug(
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
)
continue
yield site
else:
yield await self._fetch_site(sharepoint_host, allowed_site)
site_obj = await self._fetch_site(sharepoint_host, allowed_site)
if _is_excluded_sharepoint_url(site_obj.get("webUrl", "")):
self._logger.debug(
f"Skipping excluded SharePoint site: {site_obj.get('webUrl', site_obj.get('id', 'unknown'))}"
)
continue
yield site_obj

except NotFound:
self._logger.warning(
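
The exclusion guard above is repeated verbatim in all three branches of sites(). A possible consolidation, sketched here as a hypothetical helper (not part of this diff; the name _without_excluded_sites is invented for illustration):

async def _without_excluded_sites(self, sites):
    # Hypothetical wrapper: drop excluded sites from any async site generator.
    async for site in sites:
        if _is_excluded_sharepoint_url(site.get("webUrl", "")):
            self._logger.debug(
                f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
            )
            continue
        yield site

Each branch could then reduce to iterating self._without_excluded_sites(...) around its existing site generator and yielding the results.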
@@ -852,8 +881,17 @@ async def _scroll_subsites_by_parent_id(self, parent_site_id):
async def _recurse_sites(self, site_with_subsites):
subsites = site_with_subsites.pop("sites", [])
site_with_subsites.pop("sites@odata.context", None)  # remove unnecessary field
yield site_with_subsites
if subsites:

is_excluded = _is_excluded_sharepoint_url(site_with_subsites.get("webUrl", ""))

if is_excluded:
self._logger.debug(
f"Skipping excluded SharePoint site: {site_with_subsites.get('webUrl', site_with_subsites.get('id', 'unknown'))}"
)
else:
yield site_with_subsites

if subsites and not is_excluded:
async for site in self._scroll_subsites_by_parent_id(
site_with_subsites["id"]
):
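
Note the pruning semantics: because recursion is gated on not is_excluded, the subsites of an excluded site are never enumerated, even if their own webUrl would pass the check. A toy synchronous analog of that control flow (illustrative only; recurse and children are invented names):

def recurse(site, children):
    # children maps a site id to its list of subsites (hypothetical structure)
    if _is_excluded_sharepoint_url(site.get("webUrl", "")):
        return  # excluded parent: the whole subtree is pruned
    yield site
    for child in children.get(site["id"], []):
        yield from recurse(child, children)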
@@ -1113,6 +1151,12 @@ def _validate_sharepoint_rest_url(self, url):
if "OVERRIDE_URL" in os.environ:
return

# Exclude SharePoint Content Storage endpoints entirely.
# These URLs are internal and must not be crawled by the connector.
if _is_excluded_sharepoint_url(url):
    # Skip tenant validation for excluded URLs; callers are expected
    # to filter them out before fetching.
    return

# I haven't found a better way to validate tenant name for now.
actual_tenant_name = self._tenant_name_pattern.findall(url)[0]

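With the early return in place, excluded URLs now pass _validate_sharepoint_rest_url without raising; the assumption is that callers filter such URLs out before fetching them. A hedged sketch of the observable behavior (source stands for a configured data source instance; URLs are illustrative):

# Returns None: tenant validation is skipped for excluded URLs.
source._validate_sharepoint_rest_url("https://tenant.sharepoint.com/contentstorage/CSP_xyz")

# Still runs the tenant-name check and may raise on a mismatch.
source._validate_sharepoint_rest_url("https://tenant.sharepoint.com/sites/Marketing")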