50 changes: 47 additions & 3 deletions connectors/sources/sharepoint_online.py
@@ -75,6 +75,19 @@
WILDCARD = "*"
DRIVE_ITEMS_FIELDS = "id,content.downloadUrl,lastModifiedDateTime,lastModifiedBy,root,deleted,file,folder,package,name,webUrl,createdBy,createdDateTime,size,parentReference"

# Exclude specific SharePoint paths entirely at the connector level (applied before sync rules)
EXCLUDED_SHAREPOINT_PATH_SEGMENTS = ["/contentstorage/"]


def _is_excluded_sharepoint_url(url: str) -> bool:
    try:
        return any(
            segment in url.lower() for segment in EXCLUDED_SHAREPOINT_PATH_SEGMENTS
        )
    except Exception:
        # Defensive: treat non-string values (e.g. None) as not excluded
        return False


CURSOR_SITE_DRIVE_KEY = "site_drives"

# Microsoft Graph API Delta constants
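
A quick sanity check of the helper's matching semantics (a hedged sketch; the URLs below are illustrative, not taken from this change):

# Illustrative only: matching is substring-based and case-insensitive.
assert _is_excluded_sharepoint_url("https://tenant.sharepoint.com/contentstorage/CSP_xyz")
assert _is_excluded_sharepoint_url("https://tenant.sharepoint.com/ContentStorage/CSP_xyz")
assert not _is_excluded_sharepoint_url("https://tenant.sharepoint.com/sites/Marketing")
assert not _is_excluded_sharepoint_url("")  # the default used by site.get("webUrl", "")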
@@ -784,6 +797,11 @@ async def sites(
if allowed_root_sites == [WILDCARD] or enumerate_all_sites:
self._logger.debug(f"Looking up all sites to fetch: {allowed_root_sites}")
async for site in self._all_sites(sharepoint_host, allowed_root_sites):
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
self._logger.debug(
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
)
continue
yield site
else:
self._logger.debug(f"Looking up individual sites: {allowed_root_sites}")
@@ -793,9 +811,20 @@
async for site in self._fetch_site_and_subsites_by_path(
sharepoint_host, allowed_site
):
if _is_excluded_sharepoint_url(site.get("webUrl", "")):
self._logger.debug(
f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
)
continue
yield site
else:
yield await self._fetch_site(sharepoint_host, allowed_site)
site_obj = await self._fetch_site(sharepoint_host, allowed_site)
if _is_excluded_sharepoint_url(site_obj.get("webUrl", "")):
self._logger.debug(
f"Skipping excluded SharePoint site: {site_obj.get('webUrl', site_obj.get('id', 'unknown'))}"
)
continue
yield site_obj

except NotFound:
self._logger.warning(
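
The exclusion guard above is repeated verbatim in all three branches of sites(). A possible consolidation, sketched here as a hypothetical helper (not part of this diff; the name _without_excluded_sites is invented for illustration):

async def _without_excluded_sites(self, sites):
    # Hypothetical wrapper: drop excluded sites from any async site generator.
    async for site in sites:
        if _is_excluded_sharepoint_url(site.get("webUrl", "")):
            self._logger.debug(
                f"Skipping excluded SharePoint site: {site.get('webUrl', site.get('id', 'unknown'))}"
            )
            continue
        yield site

Each branch could then reduce to iterating self._without_excluded_sites(...) around its existing site generator and yielding the results.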
@@ -852,8 +881,17 @@ async def _scroll_subsites_by_parent_id(self, parent_site_id):
async def _recurse_sites(self, site_with_subsites):
subsites = site_with_subsites.pop("sites", [])
site_with_subsites.pop("sites@odata.context", None)  # remove unnecessary field
yield site_with_subsites
if subsites:

is_excluded = _is_excluded_sharepoint_url(site_with_subsites.get("webUrl", ""))

if is_excluded:
self._logger.debug(
f"Skipping excluded SharePoint site: {site_with_subsites.get('webUrl', site_with_subsites.get('id', 'unknown'))}"
)
else:
yield site_with_subsites

if subsites and not is_excluded:
async for site in self._scroll_subsites_by_parent_id(
site_with_subsites["id"]
):
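
Note the pruning semantics: because recursion is gated on not is_excluded, the subsites of an excluded site are never enumerated, even if their own webUrl would pass the check. A toy synchronous analog of that control flow (illustrative only; recurse and children are invented names):

def recurse(site, children):
    # children maps a site id to its list of subsites (hypothetical structure)
    if _is_excluded_sharepoint_url(site.get("webUrl", "")):
        return  # excluded parent: the whole subtree is pruned
    yield site
    for child in children.get(site["id"], []):
        yield from recurse(child, children)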
@@ -1113,6 +1151,12 @@ def _validate_sharepoint_rest_url(self, url):
if "OVERRIDE_URL" in os.environ:
return

# Exclude SharePoint Content Storage endpoints entirely.
# These URLs are internal and must not be crawled by the connector.
if _is_excluded_sharepoint_url(url):
    # Skip tenant validation for excluded URLs; callers are expected
    # to filter them out before fetching.
    return

# I haven't found a better way to validate tenant name for now.
actual_tenant_name = self._tenant_name_pattern.findall(url)[0]

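With the early return in place, excluded URLs now pass _validate_sharepoint_rest_url without raising; the assumption is that callers filter such URLs out before fetching them. A hedged sketch of the observable behavior (source stands for a configured data source instance; URLs are illustrative):

# Returns None: tenant validation is skipped for excluded URLs.
source._validate_sharepoint_rest_url("https://tenant.sharepoint.com/contentstorage/CSP_xyz")

# Still runs the tenant-name check and may raise on a mismatch.
source._validate_sharepoint_rest_url("https://tenant.sharepoint.com/sites/Marketing")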