diff --git a/NOTICE.txt b/NOTICE.txt
index a12779a8a..aa9c34c2b 100644
--- a/NOTICE.txt
+++ b/NOTICE.txt
@@ -1880,7 +1880,7 @@ SOFTWARE.
 
 
 azure-core
-1.35.1
+1.36.0
 MIT License
 
 Copyright (c) Microsoft Corporation.
@@ -2606,7 +2606,7 @@ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 
 
 cachetools
-6.2.0
+6.2.1
 MIT License
 
 The MIT License (MIT)
@@ -2687,7 +2687,7 @@ documentation is licensed as follows:
 
 
 charset-normalizer
-3.4.3
+3.4.4
 MIT
 
 MIT License
@@ -5100,11 +5100,11 @@ OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 
 
 idna
-3.10
-BSD License
+3.11
+UNKNOWN
 BSD 3-Clause License
 
-Copyright (c) 2013-2024, Kim Davies and contributors.
+Copyright (c) 2013-2025, Kim Davies and contributors.
 All rights reserved.
 
 Redistribution and use in source and binary forms, with or without
@@ -5977,7 +5977,7 @@ BSD
 UNKNOWN
 
 propcache
-0.4.0
+0.4.1
 Apache Software License
 
 Apache License
diff --git a/connectors/sources/sharepoint_online.py b/connectors/sources/sharepoint_online.py
index 7adbf3450..b8cc3fa6f 100644
--- a/connectors/sources/sharepoint_online.py
+++ b/connectors/sources/sharepoint_online.py
@@ -75,6 +75,19 @@ WILDCARD = "*"
 
 DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"
 
+# Exclude specific SharePoint paths entirely at the connector level (pre sync-rules)
+EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/"]
+
+
+def _is_excluded_sharepoint_url(url: str) -> bool:
+    try:
+        return any(
+            segment in url.lower() for segment in EXCLUDED_SHAREPOINT_PATH_SEGMENTS
+        )
+    except Exception:
+        return False
+
+
 CURSOR_SITE_DRIVE_KEY = "site_drives"
 
 # Microsoft Graph API Delta constants
@@ -784,6 +797,11 @@ async def sites(
         if allowed_root_sites == [WILDCARD] or enumerate_all_sites:
             self._logger.debug(f"Looking up all sites to fetch: {allowed_root_sites}")
             async for site in self._all_sites(sharepoint_host, allowed_root_sites):
+                if _is_excluded_sharepoint_url(site.get("webUrl", "")):
+                    self._logger.debug(
+                        f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
+                    )
+                    continue
                 yield site
         else:
             self._logger.debug(f"Looking up individual sites: {allowed_root_sites}")
@@ -793,9 +811,20 @@ async def sites(
                         async for site in self._fetch_site_and_subsites_by_path(
                             sharepoint_host, allowed_site
                         ):
+                            if _is_excluded_sharepoint_url(site.get("webUrl", "")):
+                                self._logger.debug(
+                                    f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
+                                )
+                                continue
                             yield site
                     else:
-                        yield await self._fetch_site(sharepoint_host, allowed_site)
+                        site_obj = await self._fetch_site(sharepoint_host, allowed_site)
+                        if _is_excluded_sharepoint_url(site_obj.get("webUrl", "")):
+                            self._logger.debug(
+                                f"Skipping excluded SharePoint site: {site_obj.get('webUrl', site_obj.get('id', 'unknown'))}"
+                            )
+                            continue
+                        yield site_obj
 
                 except NotFound:
                     self._logger.warning(
@@ -852,8 +881,17 @@ async def _scroll_subsites_by_parent_id(self, parent_site_id):
     async def _recurse_sites(self, site_with_subsites):
         subsites = site_with_subsites.pop("sites", [])
         site_with_subsites.pop("sites@odata.context", None)  # remove unnecessary field
-        yield site_with_subsites
-        if subsites:
+
+        is_excluded = _is_excluded_sharepoint_url(site_with_subsites.get("webUrl", ""))
+
+        if is_excluded:
+            self._logger.debug(
+                f"Skipping excluded SharePoint site: {site_with_subsites.get('webUrl', site_with_subsites.get('id', 'unknown'))}"
+            )
+        else:
+            yield site_with_subsites
+
+        if subsites and not is_excluded:
             async for site in self._scroll_subsites_by_parent_id(
                 site_with_subsites["id"]
             ):
@@ -1113,6 +1151,12 @@ def _validate_sharepoint_rest_url(self, url):
         if "OVERRIDE_URL" in os.environ:
            return
 
+        # Exclude SharePoint Content Storage endpoints entirely
+        # These URLs are internal and should not be crawled by the connector
+        if _is_excluded_sharepoint_url(url):
+            # Silently return to let callers that explicitly skip excluded URLs proceed
+            return
+
         # I haven't found a better way to validate tenant name for now.
         actual_tenant_name = self._tenant_name_pattern.findall(url)[0]
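
Note on the helper introduced above: _is_excluded_sharepoint_url is a plain case-insensitive substring check, so any URL whose lowercased form contains "/contentstorage/" is skipped before sync rules are applied, and any input that cannot be lowercased is treated as not excluded. Below is a minimal pytest-style sketch of that behavior; the test names, the tenant hostname, and the test placement are illustrative and not part of this diff.

# Illustrative only -- not part of the diff above. A minimal pytest-style
# sketch of the exclusion check, assuming the helper is importable from the
# connector module.
from connectors.sources.sharepoint_online import _is_excluded_sharepoint_url


def test_content_storage_urls_are_excluded():
    # Matching is case-insensitive on the "/contentstorage/" path segment
    assert _is_excluded_sharepoint_url(
        "https://tenant.sharepoint.com/contentstorage/CSP_some-site"
    )
    assert _is_excluded_sharepoint_url(
        "https://tenant.sharepoint.com/ContentStorage/CSP_some-site"
    )


def test_regular_site_urls_are_not_excluded():
    # Ordinary site collections pass through to normal validation and sync
    assert not _is_excluded_sharepoint_url(
        "https://tenant.sharepoint.com/sites/marketing"
    )
    # Non-string input hits the except branch and is treated as not excluded
    assert not _is_excluded_sharepoint_url(None)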