
Commit a8e29c6

feat: comprehensive test coverage improvements and code quality enhancements
- Improve test coverage from 93% to 97% (4,352 statements, 138 missed)
- Add 17 new tests covering previously uncovered functions and error paths
- Fix get_partition cache isolation issue in tests by adding setup_method
- Add comprehensive tests for S3 models (get_presigned_url, validation edge cases, FASTQ pair detection)
- Add tests for run_analysis instance type analysis and error handling
- Add tests for S3 search engine (invalid tokens, buffer overflow, exception handling)
- Add tests for HealthOmics search engine (fallback filtering, error handling)
- Add tests for genomics search orchestrator (cache cleanup, timeout handling, coordination logic)
- Replace magic numbers with centralized constants in consts.py
- Add AWS partition detection with memoization for ARN construction (sketched below)
- Enhance cache management with TTL-based cleanup and size limits
- Add MCP timeout and search documentation to README
- Remove line number references from test docstrings for maintainability
- Fix duplicate fixture definitions and type errors
- Ensure all linting, formatting, type checking, and security checks pass

Total test count: 975 tests (up from 958)
Coverage improvement: +4 percentage points
All quality gates passing: Ruff, Pyright, Bandit, Pytest
1 parent ebd02bc commit a8e29c6

19 files changed: +1619 / -78 lines
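The commit message mentions AWS partition detection with memoization for ARN construction, but the file implementing it is not among the diffs shown on this page. A minimal sketch of what such a helper could look like, assuming a simple region-prefix mapping (the function body and ARN format here are illustrative, not the repository's actual implementation; only the name get_partition comes from the commit message):

from functools import lru_cache


@lru_cache(maxsize=None)
def get_partition(region: str) -> str:
    """Return the AWS partition for a region, memoized so repeated ARN builds are cheap."""
    if region.startswith('us-gov-'):
        return 'aws-us-gov'
    if region.startswith('cn-'):
        return 'aws-cn'
    return 'aws'


def build_omics_arn(region: str, account_id: str, resource: str) -> str:
    """Construct a HealthOmics ARN using the detected partition (illustrative ARN layout)."""
    return f'arn:{get_partition(region)}:omics:{region}:{account_id}:{resource}'

With memoization of this kind, tests typically need to clear the cache between cases (e.g. get_partition.cache_clear() in a setup_method), which is consistent with the cache-isolation fix listed in the commit message.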

src/aws-healthomics-mcp-server/README.md

Lines changed: 6 additions & 2 deletions
@@ -401,6 +401,8 @@ uv run -m awslabs.aws_healthomics_mcp_server.server
 - `GENOMICS_SEARCH_TIMEOUT_SECONDS` - Search timeout in seconds (default: 300)
 - `GENOMICS_SEARCH_ENABLE_HEALTHOMICS` - Enable/disable HealthOmics sequence/reference store searches (default: true)
 
+> **Note for Large S3 Buckets**: When searching very large S3 buckets (millions of objects), the genomics file search may take longer than the default MCP client timeout. If you encounter timeout errors, increase the MCP server timeout by adding a `"timeout"` property to your MCP server configuration (e.g., `"timeout": 300000` for five minutes, specified in milliseconds). This is particularly important when using the search tool with extensive S3 bucket configurations or when `GENOMICS_SEARCH_ENABLE_S3_TAG_SEARCH=true` is used with large datasets. The `"timeout"` value should always exceed `GENOMICS_SEARCH_TIMEOUT_SECONDS` so that the MCP timeout does not preempt the genomics search timeout.
+
 #### Testing Configuration Variables
 
 The following environment variables are primarily intended for testing scenarios, such as integration testing against mock service endpoints:
@@ -516,6 +518,7 @@ Add to your Claude Desktop configuration:
     "aws-healthomics": {
       "command": "uvx",
       "args": ["awslabs.aws-healthomics-mcp-server"],
+      "timeout": 300000,
       "env": {
         "AWS_REGION": "us-east-1",
         "AWS_PROFILE": "your-profile",
@@ -541,6 +544,7 @@ For integration testing against mock services:
     "aws-healthomics-test": {
      "command": "uvx",
      "args": ["awslabs.aws-healthomics-mcp-server"],
+      "timeout": 300000,
      "env": {
        "AWS_REGION": "us-east-1",
        "AWS_PROFILE": "test-profile",
@@ -572,7 +576,7 @@ For Windows users, the MCP server configuration format is slightly different:
   "mcpServers": {
     "awslabs.aws-healthomics-mcp-server": {
       "disabled": false,
-      "timeout": 60,
+      "timeout": 300000,
       "type": "stdio",
       "command": "uv",
       "args": [
@@ -606,7 +610,7 @@ For testing scenarios on Windows:
   "mcpServers": {
     "awslabs.aws-healthomics-mcp-server-test": {
       "disabled": false,
-      "timeout": 60,
+      "timeout": 300000,
       "type": "stdio",
       "command": "uv",
       "args": [

src/aws-healthomics-mcp-server/awslabs/aws_healthomics_mcp_server/consts.py

Lines changed: 80 additions & 0 deletions
@@ -92,6 +92,86 @@
 DEFAULT_GENOMICS_SEARCH_RESULT_CACHE_TTL = 600
 DEFAULT_GENOMICS_SEARCH_TAG_CACHE_TTL = 300
 
+# Cache size limits - Maximum number of entries in the cache
+DEFAULT_GENOMICS_SEARCH_MAX_FILE_CACHE_SIZE = 10000
+DEFAULT_GENOMICS_SEARCH_MAX_TAG_CACHE_SIZE = 1000
+DEFAULT_GENOMICS_SEARCH_MAX_RESULT_CACHE_SIZE = 100
+DEFAULT_GENOMICS_SEARCH_MAX_PAGINATION_CACHE_SIZE = 50
+
+# Cache cleanup behavior
+DEFAULT_CACHE_CLEANUP_KEEP_RATIO = 0.8  # Keep at most 80% of entries when cleaning up by size
+
+# Search limits and pagination
+MAX_SEARCH_RESULTS_LIMIT = 10000  # Maximum allowed results per search
+DEFAULT_HEALTHOMICS_PAGE_SIZE = 100  # Default pagination size for HealthOmics APIs
+DEFAULT_S3_PAGE_SIZE = 1000  # Default pagination size for S3 operations
+DEFAULT_RESULT_RANKER_FALLBACK_SIZE = 100  # Fallback size when max_results is invalid
+
+# Rate limiting and performance
+HEALTHOMICS_RATE_LIMIT_DELAY = 0.1  # Sleep delay between HealthOmics Storage API calls (10 TPS)
+
+# Cache cleanup sweep probabilities for entries with expired TTLs (as percentages for clarity)
+PAGINATION_CACHE_CLEANUP_PROBABILITY = 1  # 1% chance (1 in 100)
+S3_CACHE_CLEANUP_PROBABILITY = 2  # 2% chance (1 in 50)
+
+# Buffer size optimization thresholds
+CURSOR_PAGINATION_BUFFER_THRESHOLD = 5000  # Use cursor pagination above this buffer size
+CURSOR_PAGINATION_PAGE_THRESHOLD = 10  # Use cursor pagination above this page number
+BUFFER_EFFICIENCY_LOW_THRESHOLD = 0.1  # 10% efficiency threshold
+BUFFER_EFFICIENCY_HIGH_THRESHOLD = 0.5  # 50% efficiency threshold
+
+# Buffer size complexity multipliers
+COMPLEXITY_MULTIPLIER_FILE_TYPE_FILTER = 0.8  # Reduce complexity when file type is filtered
+COMPLEXITY_MULTIPLIER_ASSOCIATED_FILES = 1.2  # Increase complexity for associated files
+COMPLEXITY_MULTIPLIER_BUFFER_OVERFLOW = 1.5  # Increase when buffer overflows occur
+COMPLEXITY_MULTIPLIER_LOW_EFFICIENCY = 2.0  # Increase when efficiency is low
+COMPLEXITY_MULTIPLIER_HIGH_EFFICIENCY = 0.8  # Decrease when efficiency is high
+
+# Pattern matching thresholds and multipliers
+FUZZY_MATCH_THRESHOLD = 0.6  # Minimum similarity for fuzzy matches
+MULTIPLE_MATCH_BONUS_MULTIPLIER = 1.2  # 20% bonus for multiple pattern matches
+TAG_MATCH_PENALTY_MULTIPLIER = 0.9  # 10% penalty for tag matches vs path matches
+SUBSTRING_MATCH_MAX_MULTIPLIER = 0.8  # Maximum score multiplier for substring matches
+FUZZY_MATCH_MAX_MULTIPLIER = 0.6  # Maximum score multiplier for fuzzy matches
+
+# Match quality score thresholds
+MATCH_QUALITY_EXCELLENT_THRESHOLD = 0.8
+MATCH_QUALITY_GOOD_THRESHOLD = 0.6
+MATCH_QUALITY_FAIR_THRESHOLD = 0.4
+
+# Match quality labels
+MATCH_QUALITY_EXCELLENT = 'excellent'
+MATCH_QUALITY_GOOD = 'good'
+MATCH_QUALITY_FAIR = 'fair'
+MATCH_QUALITY_POOR = 'poor'
+
+# Unit conversion constants
+BYTES_PER_KILOBYTE = 1024
+MILLISECONDS_PER_SECOND = 1000.0
+
+# HealthOmics status constants
+HEALTHOMICS_STATUS_ACTIVE = 'ACTIVE'
+
+# HealthOmics storage class constants
+HEALTHOMICS_STORAGE_CLASS_MANAGED = 'MANAGED'
+
+# Storage tier constants
+STORAGE_TIER_HOT = 'hot'
+STORAGE_TIER_WARM = 'warm'
+STORAGE_TIER_COLD = 'cold'
+STORAGE_TIER_UNKNOWN = 'unknown'
+
+# S3 storage class constants
+S3_STORAGE_CLASS_STANDARD = 'STANDARD'
+S3_STORAGE_CLASS_REDUCED_REDUNDANCY = 'REDUCED_REDUNDANCY'
+S3_STORAGE_CLASS_STANDARD_IA = 'STANDARD_IA'
+S3_STORAGE_CLASS_ONEZONE_IA = 'ONEZONE_IA'
+S3_STORAGE_CLASS_INTELLIGENT_TIERING = 'INTELLIGENT_TIERING'
+S3_STORAGE_CLASS_GLACIER = 'GLACIER'
+S3_STORAGE_CLASS_DEEP_ARCHIVE = 'DEEP_ARCHIVE'
+S3_STORAGE_CLASS_OUTPOSTS = 'OUTPOSTS'
+S3_STORAGE_CLASS_GLACIER_IR = 'GLACIER_IR'
+
 # Error messages
 
 ERROR_INVALID_STORAGE_TYPE = 'Invalid storage type. Must be one of: {}'
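The sweep-probability constants above are written as percentages; in the orchestrator diff further down they are turned into a 1-in-N draw with `secrets.randbelow`. A small standalone illustration of that conversion (not the server code itself):

import secrets

S3_CACHE_CLEANUP_PROBABILITY = 2  # percent, as defined above


def should_sweep(probability_percent: int) -> bool:
    """Return True roughly probability_percent% of the time."""
    # A p% chance is one draw in (100 // p): randbelow(50) == 0 fires about 2% of the time.
    return secrets.randbelow(100 // probability_percent) == 0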

src/aws-healthomics-mcp-server/awslabs/aws_healthomics_mcp_server/models/search.py

Lines changed: 6 additions & 0 deletions
@@ -194,6 +194,12 @@ class SearchConfig:
     result_cache_ttl_seconds: int = 600  # Result cache TTL (10 minutes)
     tag_cache_ttl_seconds: int = 300  # Tag cache TTL (5 minutes)
 
+    # Cache size limits
+    max_tag_cache_size: int = 1000  # Maximum number of tag cache entries
+    max_result_cache_size: int = 100  # Maximum number of result cache entries
+    max_pagination_cache_size: int = 50  # Maximum number of pagination cache entries
+    cache_cleanup_keep_ratio: float = 0.8  # Ratio of entries to keep during size-based cleanup
+
     # Pagination performance optimization settings
     enable_cursor_based_pagination: bool = (
         True  # Enable cursor-based pagination for large datasets
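A hedged usage sketch of the new fields; it assumes SearchConfig can be constructed with keyword arguments (e.g. as a dataclass or pydantic model) and that the import path follows the file location shown above. Only the field names and defaults come from the diff:

from awslabs.aws_healthomics_mcp_server.models.search import SearchConfig  # path assumed

# Tighten the cache limits for a memory-constrained deployment (values are illustrative).
config = SearchConfig(
    max_tag_cache_size=500,         # default 1000
    max_result_cache_size=50,       # default 100
    max_pagination_cache_size=25,   # default 50
    cache_cleanup_keep_ratio=0.8,   # keep 80% of entries after a size-based sweep
)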

src/aws-healthomics-mcp-server/awslabs/aws_healthomics_mcp_server/search/genomics_search_orchestrator.py

Lines changed: 93 additions & 15 deletions
@@ -17,6 +17,19 @@
 import asyncio
 import secrets
 import time
+from awslabs.aws_healthomics_mcp_server.consts import (
+    BUFFER_EFFICIENCY_HIGH_THRESHOLD,
+    BUFFER_EFFICIENCY_LOW_THRESHOLD,
+    COMPLEXITY_MULTIPLIER_ASSOCIATED_FILES,
+    COMPLEXITY_MULTIPLIER_BUFFER_OVERFLOW,
+    COMPLEXITY_MULTIPLIER_FILE_TYPE_FILTER,
+    COMPLEXITY_MULTIPLIER_HIGH_EFFICIENCY,
+    COMPLEXITY_MULTIPLIER_LOW_EFFICIENCY,
+    CURSOR_PAGINATION_BUFFER_THRESHOLD,
+    CURSOR_PAGINATION_PAGE_THRESHOLD,
+    MAX_SEARCH_RESULTS_LIMIT,
+    S3_CACHE_CLEANUP_PROBABILITY,
+)
 from awslabs.aws_healthomics_mcp_server.models import (
     GenomicsFile,
     GenomicsFileResult,
@@ -356,8 +369,10 @@ async def search_paginated(
         )
         self._cache_pagination_state(cache_key, cache_entry)
 
-        # Clean up expired cache entries periodically
-        if secrets.randbelow(20) == 0:  # 5% chance to clean up cache
+        # Clean up expired cache entries periodically (reduced frequency due to size-based cleanup)
+        if (
+            secrets.randbelow(100) == 0
+        ):  # Probability defined by PAGINATION_CACHE_CLEANUP_PROBABILITY
             try:
                 self.cleanup_expired_pagination_cache()
             except Exception as e:
@@ -424,8 +439,8 @@ def _validate_search_request(self, request: GenomicsFileSearchRequest) -> None:
         if request.max_results <= 0:
             raise ValueError('max_results must be greater than 0')
 
-        if request.max_results > 10000:
-            raise ValueError('max_results cannot exceed 10000')
+        if request.max_results > MAX_SEARCH_RESULTS_LIMIT:
+            raise ValueError(f'max_results cannot exceed {MAX_SEARCH_RESULTS_LIMIT}')
 
         # Validate file_type if provided
         if request.file_type:
@@ -489,10 +504,11 @@ async def _execute_parallel_searches(
             else:
                 logger.warning(f'Unexpected result type from {storage_system}: {type(result)}')
 
-        # Periodically clean up expired cache entries (approximately every 10th search)
+        # Periodically clean up expired cache entries (reduced frequency due to size-based cleanup)
         if (
-            secrets.randbelow(10) == 0 and self.s3_engine is not None
-        ):  # 10% chance to clean up cache
+            secrets.randbelow(100 // S3_CACHE_CLEANUP_PROBABILITY) == 0
+            and self.s3_engine is not None
+        ):  # Probability defined by S3_CACHE_CLEANUP_PROBABILITY
             try:
                 self.s3_engine.cleanup_expired_cache_entries()
             except Exception as e:
@@ -1003,6 +1019,10 @@ def _cache_pagination_state(self, cache_key: str, entry: 'PaginationCacheEntry')
         if not hasattr(self, '_pagination_cache'):
             self._pagination_cache = {}
 
+        # Check if we need to clean up before adding
+        if len(self._pagination_cache) >= self.config.max_pagination_cache_size:
+            self._cleanup_pagination_cache_by_size()
+
         entry.update_timestamp()
         self._pagination_cache[cache_key] = entry
         logger.debug(f'Cached pagination state for key: {cache_key}')
@@ -1030,26 +1050,26 @@ def _optimize_buffer_size(
 
         # File type filtering reduces complexity
         if request.file_type:
-            complexity_multiplier *= 0.8
+            complexity_multiplier *= COMPLEXITY_MULTIPLIER_FILE_TYPE_FILTER
 
         # Associated files increase complexity
         if request.include_associated_files:
-            complexity_multiplier *= 1.2
+            complexity_multiplier *= COMPLEXITY_MULTIPLIER_ASSOCIATED_FILES
 
         # Adjust based on historical metrics
        if metrics:
            # If we had buffer overflows, increase buffer size
            if metrics.buffer_overflows > 0:
-                complexity_multiplier *= 1.5
+                complexity_multiplier *= COMPLEXITY_MULTIPLIER_BUFFER_OVERFLOW
 
            # If efficiency was low, increase buffer size
            efficiency_ratio = metrics.total_results_fetched / max(
                metrics.total_objects_scanned, 1
            )
-            if efficiency_ratio < 0.1:  # Less than 10% efficiency
-                complexity_multiplier *= 2.0
-            elif efficiency_ratio > 0.5:  # More than 50% efficiency
-                complexity_multiplier *= 0.8
+            if efficiency_ratio < BUFFER_EFFICIENCY_LOW_THRESHOLD:
+                complexity_multiplier *= COMPLEXITY_MULTIPLIER_LOW_EFFICIENCY
+            elif efficiency_ratio > BUFFER_EFFICIENCY_HIGH_THRESHOLD:
+                complexity_multiplier *= COMPLEXITY_MULTIPLIER_HIGH_EFFICIENCY
 
         optimized_size = int(base_buffer_size * complexity_multiplier)
 
@@ -1098,9 +1118,63 @@ def _should_use_cursor_pagination(
         """
         # Use cursor pagination for large buffer sizes or high page numbers
         return self.config.enable_cursor_based_pagination and (
-            request.pagination_buffer_size > 5000 or global_token.page_number > 10
+            request.pagination_buffer_size > CURSOR_PAGINATION_BUFFER_THRESHOLD
+            or global_token.page_number > CURSOR_PAGINATION_PAGE_THRESHOLD
+        )
+
+    def _cleanup_pagination_cache_by_size(self) -> None:
+        """Clean up pagination cache when it exceeds max size, prioritizing expired entries first.
+
+        Strategy:
+        1. First: Remove all expired entries (regardless of age)
+        2. Then: If still over size limit, remove oldest non-expired entries
+        """
+        if not hasattr(self, '_pagination_cache'):
+            return
+
+        if len(self._pagination_cache) < self.config.max_pagination_cache_size:
+            return
+
+        target_size = int(
+            self.config.max_pagination_cache_size * self.config.cache_cleanup_keep_ratio
         )
 
+        # Separate expired and valid entries
+        expired_items = []
+        valid_items = []
+
+        for key, entry in self._pagination_cache.items():
+            if entry.is_expired(self.config.pagination_cache_ttl_seconds):
+                expired_items.append((key, entry))
+            else:
+                valid_items.append((key, entry))
+
+        # Phase 1: Remove all expired items first
+        expired_count = len(expired_items)
+        for key, _ in expired_items:
+            del self._pagination_cache[key]
+
+        # Phase 2: If still over target size, remove oldest valid items
+        remaining_count = len(self._pagination_cache)
+        additional_removals = 0
+
+        if remaining_count > target_size:
+            # Sort valid items by timestamp (oldest first)
+            valid_items.sort(key=lambda x: x[1].timestamp)
+            additional_to_remove = remaining_count - target_size
+
+            for i in range(min(additional_to_remove, len(valid_items))):
+                key, _ = valid_items[i]
+                if key in self._pagination_cache:  # Double-check key still exists
+                    del self._pagination_cache[key]
+                    additional_removals += 1
+
+        total_removed = expired_count + additional_removals
+        if total_removed > 0:
+            logger.debug(
+                f'Smart pagination cache cleanup: removed {expired_count} expired + {additional_removals} oldest valid = {total_removed} total entries, {len(self._pagination_cache)} remaining'
+            )
+
     def cleanup_expired_pagination_cache(self) -> None:
         """Clean up expired pagination cache entries to prevent memory leaks."""
         if not hasattr(self, '_pagination_cache'):
@@ -1136,10 +1210,14 @@ def get_pagination_cache_stats(self) -> Dict[str, Any]:
             'total_entries': len(self._pagination_cache),
             'valid_entries': valid_entries,
             'ttl_seconds': self.config.pagination_cache_ttl_seconds,
+            'max_cache_size': self.config.max_pagination_cache_size,
+            'cache_utilization': len(self._pagination_cache)
+            / self.config.max_pagination_cache_size,
             'config': {
                 'enable_cursor_pagination': self.config.enable_cursor_based_pagination,
                 'max_buffer_size': self.config.max_pagination_buffer_size,
                 'min_buffer_size': self.config.min_pagination_buffer_size,
                 'enable_metrics': self.config.enable_pagination_metrics,
+                'cache_cleanup_keep_ratio': self.config.cache_cleanup_keep_ratio,
             },
         }
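For concreteness, with the defaults introduced in this commit (max_pagination_cache_size = 50, cache_cleanup_keep_ratio = 0.8), the size-based cleanup aims for int(50 * 0.8) = 40 entries: expired entries are dropped first, then the oldest valid entries until the target is reached. A standalone sketch of that two-phase policy, using a plain dict keyed by timestamp (hypothetical names; the orchestrator above works on PaginationCacheEntry objects instead):

import time
from typing import Any, Dict, Tuple


def cleanup_by_size(
    cache: Dict[str, Tuple[float, Any]],  # key -> (timestamp, payload)
    max_size: int = 50,
    keep_ratio: float = 0.8,
    ttl_seconds: int = 600,
) -> None:
    """Evict expired entries first, then oldest valid ones, down to max_size * keep_ratio."""
    if len(cache) < max_size:
        return
    target = int(max_size * keep_ratio)
    now = time.time()

    # Phase 1: drop everything past its TTL.
    for key in [k for k, (ts, _) in cache.items() if now - ts > ttl_seconds]:
        del cache[key]

    # Phase 2: if still over target, evict oldest-first.
    if len(cache) > target:
        oldest_first = sorted(cache, key=lambda k: cache[k][0])
        for key in oldest_first[: len(cache) - target]:
            del cache[key]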
