17 | 17 | import asyncio |
18 | 18 | import secrets |
19 | 19 | import time |
| 20 | +from awslabs.aws_healthomics_mcp_server.consts import ( |
| 21 | + BUFFER_EFFICIENCY_HIGH_THRESHOLD, |
| 22 | + BUFFER_EFFICIENCY_LOW_THRESHOLD, |
| 23 | + COMPLEXITY_MULTIPLIER_ASSOCIATED_FILES, |
| 24 | + COMPLEXITY_MULTIPLIER_BUFFER_OVERFLOW, |
| 25 | + COMPLEXITY_MULTIPLIER_FILE_TYPE_FILTER, |
| 26 | + COMPLEXITY_MULTIPLIER_HIGH_EFFICIENCY, |
| 27 | + COMPLEXITY_MULTIPLIER_LOW_EFFICIENCY, |
| 28 | + CURSOR_PAGINATION_BUFFER_THRESHOLD, |
| 29 | + CURSOR_PAGINATION_PAGE_THRESHOLD, |
| 30 | + MAX_SEARCH_RESULTS_LIMIT, |
| 31 | + S3_CACHE_CLEANUP_PROBABILITY, |
| 32 | +) |
20 | 33 | from awslabs.aws_healthomics_mcp_server.models import ( |
21 | 34 | GenomicsFile, |
22 | 35 | GenomicsFileResult, |
@@ -356,8 +369,10 @@ async def search_paginated( |
356 | 369 | ) |
357 | 370 | self._cache_pagination_state(cache_key, cache_entry) |
358 | 371 |
359 | | - # Clean up expired cache entries periodically |
360 | | - if secrets.randbelow(20) == 0: # 5% chance to clean up cache |
| 372 | + # Clean up expired cache entries periodically (reduced frequency due to size-based cleanup) |
| 373 | + if ( |
| 374 | + secrets.randbelow(100) == 0 |
| 375 | +        ):  # 1% chance to clean up cache |
361 | 376 | try: |
362 | 377 | self.cleanup_expired_pagination_cache() |
363 | 378 | except Exception as e: |
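The change above keeps the probabilistic-throttling idiom but lowers the rate: `secrets.randbelow(n) == 0` succeeds with probability exactly 1/n, so moving from `randbelow(20)` to `randbelow(100)` drops cleanup from ~5% to ~1% of paginated searches. A minimal sketch of the idiom (the helper name and default are illustrative, not from this codebase):

```python
import secrets
from typing import Callable


def maybe_run(cleanup: Callable[[], None], one_in: int = 100) -> None:
    """Invoke cleanup on roughly 1 out of every `one_in` calls.

    secrets.randbelow(one_in) is uniform over [0, one_in), so the zero
    check succeeds with probability exactly 1 / one_in.
    """
    if secrets.randbelow(one_in) == 0:
        try:
            cleanup()
        except Exception:
            pass  # best-effort: cache cleanup must never fail the search path
```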
@@ -424,8 +439,8 @@ def _validate_search_request(self, request: GenomicsFileSearchRequest) -> None: |
424 | 439 | if request.max_results <= 0: |
425 | 440 | raise ValueError('max_results must be greater than 0') |
426 | 441 |
427 | | - if request.max_results > 10000: |
428 | | - raise ValueError('max_results cannot exceed 10000') |
| 442 | + if request.max_results > MAX_SEARCH_RESULTS_LIMIT: |
| 443 | + raise ValueError(f'max_results cannot exceed {MAX_SEARCH_RESULTS_LIMIT}') |
429 | 444 |
430 | 445 | # Validate file_type if provided |
431 | 446 | if request.file_type: |
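For reference, the literals this diff replaces pin down most of the new constants' values. A plausible excerpt of `consts.py` reconstructed from them — `S3_CACHE_CLEANUP_PROBABILITY` is an assumption, since its literal is not visible here:

```python
# Plausible excerpt of awslabs/aws_healthomics_mcp_server/consts.py,
# reconstructed from the literals replaced in this diff.
MAX_SEARCH_RESULTS_LIMIT = 10000             # replaces the inline 10000
COMPLEXITY_MULTIPLIER_FILE_TYPE_FILTER = 0.8
COMPLEXITY_MULTIPLIER_ASSOCIATED_FILES = 1.2
COMPLEXITY_MULTIPLIER_BUFFER_OVERFLOW = 1.5
COMPLEXITY_MULTIPLIER_LOW_EFFICIENCY = 2.0
COMPLEXITY_MULTIPLIER_HIGH_EFFICIENCY = 0.8
BUFFER_EFFICIENCY_LOW_THRESHOLD = 0.1        # below 10% scan efficiency
BUFFER_EFFICIENCY_HIGH_THRESHOLD = 0.5       # above 50% scan efficiency
CURSOR_PAGINATION_BUFFER_THRESHOLD = 5000
CURSOR_PAGINATION_PAGE_THRESHOLD = 10
S3_CACHE_CLEANUP_PROBABILITY = 1             # assumed: integer percent (1 => 1%)
```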
@@ -489,10 +504,11 @@ async def _execute_parallel_searches( |
489 | 504 | else: |
490 | 505 | logger.warning(f'Unexpected result type from {storage_system}: {type(result)}') |
491 | 506 |
492 | | - # Periodically clean up expired cache entries (approximately every 10th search) |
| 507 | + # Periodically clean up expired cache entries (reduced frequency due to size-based cleanup) |
493 | 508 | if ( |
494 | | - secrets.randbelow(10) == 0 and self.s3_engine is not None |
495 | | - ): # 10% chance to clean up cache |
| 509 | + secrets.randbelow(100 // S3_CACHE_CLEANUP_PROBABILITY) == 0 |
| 510 | + and self.s3_engine is not None |
| 511 | + ): # Probability defined by S3_CACHE_CLEANUP_PROBABILITY |
496 | 512 | try: |
497 | 513 | self.s3_engine.cleanup_expired_cache_entries() |
498 | 514 | except Exception as e: |
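One subtlety in the condition above: `secrets.randbelow` only accepts integers, so `100 // S3_CACHE_CLEANUP_PROBABILITY` is only well-formed if the constant is an integer percentage. A quick check under that assumption:

```python
import secrets

S3_CACHE_CLEANUP_PROBABILITY = 1  # assumed: integer percent, so 1 means 1%

n = 100 // S3_CACHE_CLEANUP_PROBABILITY  # 100 // 1 == 100
assert isinstance(n, int)  # randbelow() rejects non-integer bounds
hit = secrets.randbelow(n) == 0  # True on ~1% of calls
# If the constant were a float probability such as 0.01, 100 // 0.01 would
# yield 9999.0, and secrets.randbelow(9999.0) would fail at runtime.
```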
@@ -1003,6 +1019,10 @@ def _cache_pagination_state(self, cache_key: str, entry: 'PaginationCacheEntry') |
1003 | 1019 | if not hasattr(self, '_pagination_cache'): |
1004 | 1020 | self._pagination_cache = {} |
1005 | 1021 |
| 1022 | + # Check if we need to clean up before adding |
| 1023 | + if len(self._pagination_cache) >= self.config.max_pagination_cache_size: |
| 1024 | + self._cleanup_pagination_cache_by_size() |
| 1025 | + |
1006 | 1026 | entry.update_timestamp() |
1007 | 1027 | self._pagination_cache[cache_key] = entry |
1008 | 1028 | logger.debug(f'Cached pagination state for key: {cache_key}') |
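Running the size check before the insert gives `_cache_pagination_state` a simple invariant: cleanup shrinks the cache to `max_pagination_cache_size * cache_cleanup_keep_ratio`, and the insert then adds at most one entry, so the configured maximum is never exceeded. With assumed config values:

```python
# Invariant check with assumed config values (not from the actual config).
max_pagination_cache_size = 1000
cache_cleanup_keep_ratio = 0.8

target_size = int(max_pagination_cache_size * cache_cleanup_keep_ratio)  # 800
# Worst case right after an insert: target_size entries plus the new one.
assert target_size + 1 <= max_pagination_cache_size
```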
@@ -1030,26 +1050,26 @@ def _optimize_buffer_size( |
1030 | 1050 |
1031 | 1051 | # File type filtering reduces complexity |
1032 | 1052 | if request.file_type: |
1033 | | - complexity_multiplier *= 0.8 |
| 1053 | + complexity_multiplier *= COMPLEXITY_MULTIPLIER_FILE_TYPE_FILTER |
1034 | 1054 |
1035 | 1055 | # Associated files increase complexity |
1036 | 1056 | if request.include_associated_files: |
1037 | | - complexity_multiplier *= 1.2 |
| 1057 | + complexity_multiplier *= COMPLEXITY_MULTIPLIER_ASSOCIATED_FILES |
1038 | 1058 |
1039 | 1059 | # Adjust based on historical metrics |
1040 | 1060 | if metrics: |
1041 | 1061 | # If we had buffer overflows, increase buffer size |
1042 | 1062 | if metrics.buffer_overflows > 0: |
1043 | | - complexity_multiplier *= 1.5 |
| 1063 | + complexity_multiplier *= COMPLEXITY_MULTIPLIER_BUFFER_OVERFLOW |
1044 | 1064 |
1045 | 1065 | # If efficiency was low, increase buffer size |
1046 | 1066 | efficiency_ratio = metrics.total_results_fetched / max( |
1047 | 1067 | metrics.total_objects_scanned, 1 |
1048 | 1068 | ) |
1049 | | - if efficiency_ratio < 0.1: # Less than 10% efficiency |
1050 | | - complexity_multiplier *= 2.0 |
1051 | | - elif efficiency_ratio > 0.5: # More than 50% efficiency |
1052 | | - complexity_multiplier *= 0.8 |
| 1069 | + if efficiency_ratio < BUFFER_EFFICIENCY_LOW_THRESHOLD: |
| 1070 | + complexity_multiplier *= COMPLEXITY_MULTIPLIER_LOW_EFFICIENCY |
| 1071 | + elif efficiency_ratio > BUFFER_EFFICIENCY_HIGH_THRESHOLD: |
| 1072 | + complexity_multiplier *= COMPLEXITY_MULTIPLIER_HIGH_EFFICIENCY |
1053 | 1073 |
1054 | 1074 | optimized_size = int(base_buffer_size * complexity_multiplier) |
1055 | 1075 |
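To see how the multiplier chain composes, here is a worked pass through `_optimize_buffer_size` using the values read off the replaced literals; the base buffer size of 1000 is an assumed input:

```python
# Worked example of the multiplier chain; base_buffer_size is an assumption.
base_buffer_size = 1000
complexity_multiplier = 1.0

complexity_multiplier *= 0.8  # file_type filter set        -> 0.80
complexity_multiplier *= 1.2  # associated files included   -> 0.96
complexity_multiplier *= 1.5  # prior buffer overflows      -> 1.44
complexity_multiplier *= 2.0  # efficiency ratio below 0.1  -> 2.88

optimized_size = int(base_buffer_size * complexity_multiplier)  # 2880
# The real method presumably clamps this to the configured
# min/max pagination buffer sizes reported in the stats below.
```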
@@ -1098,9 +1118,63 @@ def _should_use_cursor_pagination( |
1098 | 1118 | """ |
1099 | 1119 | # Use cursor pagination for large buffer sizes or high page numbers |
1100 | 1120 | return self.config.enable_cursor_based_pagination and ( |
1101 | | - request.pagination_buffer_size > 5000 or global_token.page_number > 10 |
| 1121 | + request.pagination_buffer_size > CURSOR_PAGINATION_BUFFER_THRESHOLD |
| 1122 | + or global_token.page_number > CURSOR_PAGINATION_PAGE_THRESHOLD |
| 1123 | + ) |
| 1124 | + |
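Read with the literal values it replaces (5000 and 10), the heuristic switches to cursor-based pagination exactly when offset-style paging gets expensive: very large buffers or deep page numbers. A self-contained restatement:

```python
# Restatement of the heuristic with the replaced literal values (5000, 10).
def should_use_cursor(enabled: bool, buffer_size: int, page_number: int) -> bool:
    return enabled and (buffer_size > 5000 or page_number > 10)


assert should_use_cursor(True, 8000, 1)        # large buffer triggers it
assert should_use_cursor(True, 1000, 25)       # so does a deep page
assert not should_use_cursor(False, 8000, 25)  # the feature flag gates both
```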
| 1125 | + def _cleanup_pagination_cache_by_size(self) -> None: |
| 1126 | + """Clean up pagination cache when it exceeds max size, prioritizing expired entries first. |
| 1127 | +
| 1128 | + Strategy: |
| 1129 | + 1. First: Remove all expired entries (regardless of age) |
| 1130 | + 2. Then: If still over size limit, remove oldest non-expired entries |
| 1131 | + """ |
| 1132 | + if not hasattr(self, '_pagination_cache'): |
| 1133 | + return |
| 1134 | + |
| 1135 | + if len(self._pagination_cache) < self.config.max_pagination_cache_size: |
| 1136 | + return |
| 1137 | + |
| 1138 | + target_size = int( |
| 1139 | + self.config.max_pagination_cache_size * self.config.cache_cleanup_keep_ratio |
1102 | 1140 | ) |
1103 | 1141 |
| 1142 | + # Separate expired and valid entries |
| 1143 | + expired_items = [] |
| 1144 | + valid_items = [] |
| 1145 | + |
| 1146 | + for key, entry in self._pagination_cache.items(): |
| 1147 | + if entry.is_expired(self.config.pagination_cache_ttl_seconds): |
| 1148 | + expired_items.append((key, entry)) |
| 1149 | + else: |
| 1150 | + valid_items.append((key, entry)) |
| 1151 | + |
| 1152 | + # Phase 1: Remove all expired items first |
| 1153 | + expired_count = len(expired_items) |
| 1154 | + for key, _ in expired_items: |
| 1155 | + del self._pagination_cache[key] |
| 1156 | + |
| 1157 | + # Phase 2: If still over target size, remove oldest valid items |
| 1158 | + remaining_count = len(self._pagination_cache) |
| 1159 | + additional_removals = 0 |
| 1160 | + |
| 1161 | + if remaining_count > target_size: |
| 1162 | + # Sort valid items by timestamp (oldest first) |
| 1163 | + valid_items.sort(key=lambda x: x[1].timestamp) |
| 1164 | + additional_to_remove = remaining_count - target_size |
| 1165 | + |
| 1166 | + for i in range(min(additional_to_remove, len(valid_items))): |
| 1167 | + key, _ = valid_items[i] |
| 1168 | + if key in self._pagination_cache: # Double-check key still exists |
| 1169 | + del self._pagination_cache[key] |
| 1170 | + additional_removals += 1 |
| 1171 | + |
| 1172 | + total_removed = expired_count + additional_removals |
| 1173 | + if total_removed > 0: |
| 1174 | + logger.debug( |
| 1175 | + f'Smart pagination cache cleanup: removed {expired_count} expired + {additional_removals} oldest valid = {total_removed} total entries, {len(self._pagination_cache)} remaining' |
| 1176 | + ) |
| 1177 | + |
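A self-contained sketch of the two-phase policy, using a stub in place of `PaginationCacheEntry` (only `timestamp` and `is_expired()` are visible in this diff; the stub's shape is otherwise assumed):

```python
import time
from dataclasses import dataclass, field


@dataclass
class StubEntry:  # stand-in for PaginationCacheEntry; shape is assumed
    timestamp: float = field(default_factory=time.time)

    def is_expired(self, ttl_seconds: float) -> bool:
        return time.time() - self.timestamp > ttl_seconds


def cleanup_by_size(cache: dict, max_size: int, keep_ratio: float, ttl: float) -> None:
    """Phase 1: drop all expired entries. Phase 2: evict oldest down to target."""
    if len(cache) < max_size:
        return
    target = int(max_size * keep_ratio)
    for key in [k for k, e in cache.items() if e.is_expired(ttl)]:
        del cache[key]
    if len(cache) > target:
        oldest = sorted(cache, key=lambda k: cache[k].timestamp)
        for key in oldest[: len(cache) - target]:
            del cache[key]


# Ten entries, one minute apart, with a 5-minute TTL: expired entries go
# first, then oldest-first eviction trims the rest to the target of 5.
cache = {f'k{i}': StubEntry(timestamp=time.time() - i * 60) for i in range(10)}
cleanup_by_size(cache, max_size=10, keep_ratio=0.5, ttl=300)
assert sorted(cache) == ['k0', 'k1', 'k2', 'k3', 'k4']
```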
1104 | 1178 | def cleanup_expired_pagination_cache(self) -> None: |
1105 | 1179 | """Clean up expired pagination cache entries to prevent memory leaks.""" |
1106 | 1180 | if not hasattr(self, '_pagination_cache'): |
@@ -1136,10 +1210,14 @@ def get_pagination_cache_stats(self) -> Dict[str, Any]: |
1136 | 1210 | 'total_entries': len(self._pagination_cache), |
1137 | 1211 | 'valid_entries': valid_entries, |
1138 | 1212 | 'ttl_seconds': self.config.pagination_cache_ttl_seconds, |
| 1213 | + 'max_cache_size': self.config.max_pagination_cache_size, |
| 1214 | + 'cache_utilization': len(self._pagination_cache) |
| 1215 | + / self.config.max_pagination_cache_size, |
1139 | 1216 | 'config': { |
1140 | 1217 | 'enable_cursor_pagination': self.config.enable_cursor_based_pagination, |
1141 | 1218 | 'max_buffer_size': self.config.max_pagination_buffer_size, |
1142 | 1219 | 'min_buffer_size': self.config.min_pagination_buffer_size, |
1143 | 1220 | 'enable_metrics': self.config.enable_pagination_metrics, |
| 1221 | + 'cache_cleanup_keep_ratio': self.config.cache_cleanup_keep_ratio, |
1144 | 1222 | }, |
1145 | 1223 | } |
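The new `max_cache_size` and `cache_utilization` fields make cache pressure observable. A hypothetical monitoring hook built on them — `search_service`, `logger`, and the 0.9 alert threshold are illustrative, not from this codebase:

```python
# Hypothetical monitoring hook; search_service and the threshold are assumed.
stats = search_service.get_pagination_cache_stats()
if stats['cache_utilization'] > 0.9:
    logger.warning(
        f'Pagination cache at {stats["cache_utilization"]:.0%} of '
        f'{stats["max_cache_size"]} entries '
        f'(keep ratio {stats["config"]["cache_cleanup_keep_ratio"]})'
    )
```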