diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index 7adbf3450..b8cc3fa6f 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -75,6 +75,19 @@ WILDCARD = "*"
 
 DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"
 
+# Exclude specific SharePoint paths entirely at the connector level,
+# before any sync rules are applied.
+EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/"]
+
+
+def _is_excluded_sharepoint_url(url: str) -> bool:
+    if not url:
+        return False
+    return any(
+        segment in url.lower() for segment in EXCLUDED_SHAREPOINT_PATH_SEGMENTS
+    )
+
+
 CURSOR_SITE_DRIVE_KEY = "site_drives"
 
 # Microsoft Graph API Delta constants
@@ -784,6 +797,11 @@ async def sites(
         if allowed_root_sites == [WILDCARD] or enumerate_all_sites:
             self._logger.debug(f"Looking up all sites to fetch: {allowed_root_sites}")
             async for site in self._all_sites(sharepoint_host, allowed_root_sites):
+                if _is_excluded_sharepoint_url(site.get("webUrl", "")):
+                    self._logger.debug(
+                        f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
+                    )
+                    continue
                 yield site
         else:
             self._logger.debug(f"Looking up individual sites: {allowed_root_sites}")
@@ -793,9 +811,20 @@ async def sites(
                     async for site in self._fetch_site_and_subsites_by_path(
                         sharepoint_host, allowed_site
                     ):
+                        if _is_excluded_sharepoint_url(site.get("webUrl", "")):
+                            self._logger.debug(
+                                f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
+                            )
+                            continue
                         yield site
                 else:
-                    yield await self._fetch_site(sharepoint_host, allowed_site)
+                    site_obj = await self._fetch_site(sharepoint_host, allowed_site)
+                    if _is_excluded_sharepoint_url(site_obj.get("webUrl", "")):
+                        self._logger.debug(
+                            f"Skipping excluded SharePoint site: {site_obj.get('webUrl', site_obj.get('id', 'unknown'))}"
+                        )
+                        continue
+                    yield site_obj
 
                 except NotFound:
                     self._logger.warning(
@@ -852,8 +881,16 @@ async def _scroll_subsites_by_parent_id(self, parent_site_id):
     async def _recurse_sites(self, site_with_subsites):
         subsites = site_with_subsites.pop("sites", [])
         site_with_subsites.pop("sites@odata.context", None)  # remove unnecessary field
-        yield site_with_subsites
-        if subsites:
+
+        if _is_excluded_sharepoint_url(site_with_subsites.get("webUrl", "")):
+            # Skip the excluded site and, with it, its entire subsite tree.
+            self._logger.debug(
+                f"Skipping excluded SharePoint site: {site_with_subsites.get('webUrl', site_with_subsites.get('id', 'unknown'))}"
+            )
+            return
+
+        yield site_with_subsites
+        if subsites:
             async for site in self._scroll_subsites_by_parent_id(
                 site_with_subsites["id"]
             ):
@@ -1113,6 +1150,12 @@ def _validate_sharepoint_rest_url(self, url):
         if "OVERRIDE_URL" in os.environ:
             return
 
+        # Content Storage endpoints are internal to SharePoint and are never
+        # crawled by the connector, so there is nothing to validate here;
+        # callers drop excluded URLs before fetching from them.
+        if _is_excluded_sharepoint_url(url):
+            return
+
         # I haven't found a better way to validate tenant name for now.
         actual_tenant_name = self._tenant_name_pattern.findall(url)[0]
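
The exclusion predicate is a plain case-insensitive substring check, so its matching behavior is worth pinning down. Below is a minimal sketch of the expected behavior; the URLs are hypothetical examples, not taken from a real tenant.

```python
# Sketch: matching behavior of _is_excluded_sharepoint_url.
from connectors.sources.sharepoint_online import _is_excluded_sharepoint_url

assert _is_excluded_sharepoint_url(
    "https://tenant.sharepoint.com/contentstorage/CSP_xyz"
)
# Matching is case-insensitive because the URL is lowercased first.
assert _is_excluded_sharepoint_url(
    "https://tenant.sharepoint.com/ContentStorage/CSP_xyz"
)
# Regular team sites are not excluded.
assert not _is_excluded_sharepoint_url("https://tenant.sharepoint.com/sites/marketing")
# A bare "/contentstorage" with no trailing slash is NOT matched, since the
# configured segment includes both slashes.
assert not _is_excluded_sharepoint_url("https://tenant.sharepoint.com/contentstorage")
# A missing webUrl falls through to "not excluded".
assert not _is_excluded_sharepoint_url("")
```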
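
In `sites()`, the same filter-and-log pattern is applied at every point a site is yielded, so excluded sites never reach downstream consumers. Here is that pattern reduced to a standalone, runnable sketch; `fetch_sites()` is a stand-in for the Graph API calls, and only the filtering logic mirrors the diff.

```python
import asyncio
import logging

logger = logging.getLogger(__name__)

EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/"]


def _is_excluded_sharepoint_url(url: str) -> bool:
    if not url:
        return False
    return any(seg in url.lower() for seg in EXCLUDED_SHAREPOINT_PATH_SEGMENTS)


async def fetch_sites():
    # Stand-in for the Graph API pagination; yields raw site payloads.
    for site in [
        {"id": "1", "webUrl": "https://tenant.sharepoint.com/sites/marketing"},
        {"id": "2", "webUrl": "https://tenant.sharepoint.com/contentstorage/CSP_abc"},
    ]:
        yield site


async def sites():
    async for site in fetch_sites():
        if _is_excluded_sharepoint_url(site.get("webUrl", "")):
            logger.debug("Skipping excluded SharePoint site: %s", site.get("webUrl"))
            continue
        yield site


async def main():
    # Only the marketing site survives; the caller needs no extra filtering.
    print([site["webUrl"] async for site in sites()])


asyncio.run(main())
```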
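
The `_recurse_sites` change does more than drop one site: because the recursion returns before scrolling subsites, an excluded parent prunes its whole subtree. That is the intended behavior for `/contentstorage/`, where subsites share the excluded path prefix. A toy illustration, with `CHILDREN` and `recurse_sites()` as illustrative stand-ins for `_scroll_subsites_by_parent_id` and the real Graph payloads:

```python
import asyncio

# Toy site tree keyed by parent id.
CHILDREN = {
    "root": [
        {"id": "a", "webUrl": "https://tenant.sharepoint.com/contentstorage/CSP_a"},
        {"id": "b", "webUrl": "https://tenant.sharepoint.com/sites/hr"},
    ],
    "a": [{"id": "a1", "webUrl": "https://tenant.sharepoint.com/contentstorage/CSP_a/x"}],
    "b": [],
}


def _is_excluded(url: str) -> bool:
    return "/contentstorage/" in url.lower()


async def recurse_sites(site):
    # Early return drops the excluded site *and* its whole subtree, which is
    # why site "a1" never appears in the output below.
    if _is_excluded(site.get("webUrl", "")):
        return
    yield site
    for child in CHILDREN.get(site["id"], []):
        async for descendant in recurse_sites(child):
            yield descendant


async def main():
    root = {"id": "root", "webUrl": "https://tenant.sharepoint.com/sites/root"}
    print([s["id"] async for s in recurse_sites(root)])  # -> ['root', 'b']


asyncio.run(main())
```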
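
Finally, the early return in `_validate_sharepoint_rest_url` means excluded URLs skip tenant validation rather than fail it, since callers drop them before any fetch happens. A standalone sketch of that short-circuit; the regex and exception class here are assumed stand-ins for `self._tenant_name_pattern` and the connector's real validation error, defined elsewhere in the file:

```python
import re

tenant_name_pattern = re.compile(r"https://([^.]+)\.sharepoint\.com")


class InvalidSharepointTenant(Exception):
    pass


def validate_sharepoint_rest_url(url: str, expected_tenant: str = "tenant") -> None:
    # Excluded URLs never get fetched, so there is nothing to validate;
    # returning early mirrors the diff's behavior.
    if "/contentstorage/" in url.lower():
        return
    actual = tenant_name_pattern.findall(url)[0]
    if actual != expected_tenant:
        raise InvalidSharepointTenant(
            f"Unexpected tenant '{actual}', expected '{expected_tenant}'"
        )


# Neither call raises: the first is short-circuited, the second is valid.
validate_sharepoint_rest_url("https://tenant.sharepoint.com/contentstorage/CSP_a")
validate_sharepoint_rest_url("https://tenant.sharepoint.com/sites/hr")
```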