diff --git a/.github/workflows/pre-commit.yml b/.github/workflows/pre-commit.yml index 0a2dd89..d643504 100644 --- a/.github/workflows/pre-commit.yml +++ b/.github/workflows/pre-commit.yml @@ -10,7 +10,7 @@ on: jobs: pre-commit: runs-on: ubuntu-latest - name: Run pre-commit hooks on Go, Rust, JavaScripts and Python files + name: Run pre-commit hooks on Go, Rust, JavaScript, Markdown and Python files steps: - name: Check out the repo @@ -45,7 +45,8 @@ jobs: sudo apt-get install -y \ make \ build-essential \ - pkg-config + pkg-config + npm install -g markdownlint-cli - name: Cache Rust dependencies uses: actions/cache@v4 @@ -81,10 +82,10 @@ jobs: - name: Install pre-commit run: pip install pre-commit - - name: Run pre-commit on Go, Rust, JavaScript and Python files + - name: Run pre-commit on Go, Rust, JavaScript, Markdown and Python files run: | - # Find all Go, Rust, JavaScripts and Python files (excluding vendored/generated code) - FILES=$(find . -type f \( -name "*.go" -o -name "*.rs" -o -name "*.py" -o -name "*.js" \) \ + # Find all Go, Rust, JavaScript, Markdown and Python files (excluding vendored/generated code) + FILES=$(find . -type f \( -name "*.go" -o -name "*.rs" -o -name "*.py" -o -name "*.js" -o -name "*.md" \) \ ! -path "./target/*" \ ! -path "./candle-binding/target/*" \ ! -path "./.git/*" \ @@ -99,7 +100,7 @@ jobs: echo "Running pre-commit on files: $FILES" pre-commit run --files $FILES else - echo "No Go, Rust, JavaScript or Python files found to check" + echo "No Go, Rust, JavaScript, Markdown or Python files found to check" fi - name: Show pre-commit results diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1af836e..38d36c9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,7 +1,7 @@ # See https://pre-commit.com for more information # See https://pre-commit.com/hooks.html for more hooks repos: -# Basic hooks for Go, Rust, Python files only +# Basic hooks for Go, Rust, Python and JavaScript files only - repo: https://github.com/pre-commit/pre-commit-hooks rev: v6.0.0 hooks: @@ -22,6 +22,16 @@ repos: language: system files: \.go$ +# Markdown specific hooks +- repo: local + hooks: + - id: md-fmt + name: md fmt + entry: bash -c "make markdown-lint" + language: system + files: \.md$ + exclude: ^(node_modules/) + # JavaScript specific hooks - repo: local hooks: diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 03f4f3e..01f9129 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -29,18 +29,22 @@ Before you begin, ensure you have the following installed: ### Initial Setup 1. **Clone the repository:** + ```bash git clone https://github.com/vllm-project/semantic-router.git cd semantic-router ``` 2. **Download required models:** + ```bash make download-models ``` + This downloads the pre-trained classification models from Hugging Face. 3. **Install Python dependencies(Optional):** + ```bash # For training and development pip install -r requirements.txt @@ -54,6 +58,7 @@ Before you begin, ensure you have the following installed: The project consists of multiple components that need to be built in order: ### Build Everything + ```bash make build ``` @@ -61,11 +66,13 @@ make build ### Build Individual Components 1. **Rust library (Candle binding):** + ```bash make rust ``` 2. **Go router:** + ```bash make build-router ``` @@ -73,11 +80,13 @@ make build ### Running the System 1. **Start Envoy proxy** (in one terminal): + ```bash make run-envoy ``` 2.
**Start the semantic router** (in another terminal): + ```bash make run-router ``` @@ -87,16 +96,19 @@ make build ### Unit Tests 1. **Test Rust bindings:** + ```bash make test-binding ``` 2. **Test Go semantic router:** + ```bash make test-semantic-router ``` 3. **Test individual classifiers:** + ```bash make test-category-classifier make test-pii-classifier @@ -141,18 +153,19 @@ python e2e-tests/run_all_tests.py --check-only The test suite includes: -+ Basic client request tests -+ Envoy ExtProc interaction tests -+ Router classification tests -+ Semantic cache tests -+ Category-specific tests -+ Metrics validation tests +- Basic client request tests +- Envoy ExtProc interaction tests +- Router classification tests +- Semantic cache tests +- Category-specific tests +- Metrics validation tests ## Development Workflow ### Making Changes 1. **Create a feature branch:** + ```bash git checkout -b feature/your-feature-name ``` @@ -160,6 +173,7 @@ The test suite includes: 2. **Make your changes** following the project structure and coding standards. 3. **Build and test:** + ```bash make clean make build @@ -167,6 +181,7 @@ The test suite includes: ``` 4. **Run end-to-end tests:** + ```bash # Start services make run-envoy & @@ -179,6 +194,7 @@ The test suite includes: 5. **Commit your changes:** Commit your changes with a clear message, making sure to **sign off** on your work using the `-s` flag. This is required by the project's **Developer Certificate of Origin (DCO)**. + ```bash git add . git commit -s -m "feat: add your feature description" @@ -197,6 +213,7 @@ The test suite includes: Before submitting a PR, please run the pre-commit hooks to ensure code quality and consistency. **These checks are mandatory** and will be automatically run on every commit once installed. **Step 1: Install pre-commit tool** + ```bash # Using pip (recommended) pip install pre-commit @@ -209,6 +226,7 @@ brew install pre-commit ``` **Step 2: Install pre-commit hooks for this repository** + ```bash # Install pre-commit hooks pre-commit install @@ -218,6 +236,7 @@ pre-commit run --all-files ``` ### Go Code + - Follow standard Go formatting (`gofmt`) - Use meaningful variable and function names - Add comments for exported functions and types @@ -228,12 +247,14 @@ pre-commit run --all-files - The CI will automatically check that `go.mod` and `go.sum` files are tidy using `make check-go-mod-tidy` ### Rust Code + - Follow Rust formatting (`cargo fmt`) - Use `cargo clippy` for linting - Handle errors appropriately with `Result` types - Document public APIs ### Python Code + - Follow PEP 8 style guidelines - Use type hints where appropriate - Write docstrings for functions and classes @@ -241,6 +262,7 @@ pre-commit run --all-files ## Submitting Changes 1. **Ensure all tests pass:** + ```bash make test python e2e-tests/run_all_tests.py diff --git a/Makefile b/Makefile index 0e5bc1b..510e019 100644 --- a/Makefile +++ b/Makefile @@ -343,3 +343,11 @@ docs-lint: docs-lint-fix: @echo "Fixing documentation lint issues..." cd website && npm run lint:fix + +markdown-lint: + @echo "Linting markdown files..." + markdownlint -c markdownlint.yaml "**/*.md" --ignore node_modules --ignore website/node_modules + +markdown-lint-fix: + @echo "Fixing markdown lint issues..." 
+ markdownlint -c markdownlint.yaml "**/*.md" --ignore node_modules --ignore website/node_modules --fix diff --git a/README.md b/README.md index b590103..d464f3b 100644 --- a/README.md +++ b/README.md @@ -6,17 +6,17 @@ [![Hugging Face](https://img.shields.io/badge/πŸ€—%20Hugging%20Face-Community-yellow)](https://huggingface.co/LLM-Semantic-Router) [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE) [![Crates.io](https://img.shields.io/crates/v/candle-semantic-router.svg)](https://crates.io/crates/candle-semantic-router) -![](https://github.com/vllm-project/semantic-router/workflows/Test%20And%20Build/badge.svg) +![Test And Build](https://github.com/vllm-project/semantic-router/workflows/Test%20And%20Build/badge.svg) **πŸ“š [Complete Documentation](https://vllm-semantic-router.com) | πŸš€ [Quick Start](https://vllm-semantic-router.com/docs/getting-started/installation) | πŸ“£ [Blog](https://vllm-semantic-router.com/blog/) | πŸ“– [API Reference](https://vllm-semantic-router.com/docs/api/router/)** -![](./website/static/img/code.png) +![code](./website/static/img/code.png) ## Innovations ✨ -![](./website/static/img/architecture.png) +![architecture](./website/static/img/architecture.png) ### Intelligent Routing 🧠 @@ -35,6 +35,7 @@ The screenshot below shows the LLM Router dashboard in Grafana. ![LLM Router Dashboard](./website/static/img/grafana_screenshot.png) The router is implemented in two ways: + - Golang (with Rust FFI based on the [candle](https://github.com/huggingface/candle) rust ML framework) - Python Benchmarking will be conducted to determine the best implementation. @@ -64,6 +65,7 @@ For comprehensive documentation including detailed setup instructions, architect **πŸ‘‰ [Complete Documentation at Read the Docs](https://vllm-semantic-router.com/)** The documentation includes: + - **[Installation Guide](https://vllm-semantic-router.com/docs/getting-started/installation/)** - Complete setup instructions - **[System Architecture](https://vllm-semantic-router.com/docs/architecture/system-architecture/)** - Technical deep dive - **[Model Training](https://vllm-semantic-router.com/docs/training/training-overview/)** - How classification models work @@ -90,4 +92,4 @@ If you find Semantic Router helpful in your research or projects, please conside We opened the project at Aug 31, 2025. We love open source and collaboration ❀️ -[![Star History Chart](https://api.star-history.com/svg?repos=vllm-project/semantic-router&type=Date)](https://www.star-history.com/#vllm-project/semantic-router&Date) \ No newline at end of file +[![Star History Chart](https://api.star-history.com/svg?repos=vllm-project/semantic-router&type=Date)](https://www.star-history.com/#vllm-project/semantic-router&Date) diff --git a/candle-binding/README.md b/candle-binding/README.md index 5f6c2c8..7338f15 100644 --- a/candle-binding/README.md +++ b/candle-binding/README.md @@ -33,9 +33,11 @@ go test -v - The `-v` flag enables verbose output. - If you want to run a specific test, use: + ```sh go test -v -run TestName ``` + Replace `TestName` with the name of the test function. ## Troubleshooting @@ -46,4 +48,4 @@ go test -v ## Notes - The Go tests depend on the native library being present and correctly built. -- Some tests may download data from the internet (e.g., from norvig.com). \ No newline at end of file +- Some tests may download data from the internet (e.g., from norvig.com). 
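For contributors, the new Markdown tooling introduced above can be exercised locally. A minimal sketch, assuming `npm` and `pre-commit` are already installed; the commands mirror the CI step, the new Makefile targets, and the `md-fmt` hook from this change:

```bash
# Install the markdownlint CLI globally, as the CI workflow step does
npm install -g markdownlint-cli

# Report Markdown issues using the repo's markdownlint.yaml config
make markdown-lint

# Auto-fix whatever markdownlint can repair in place
make markdown-lint-fix

# Run only the new pre-commit hook against specific files
pre-commit run md-fmt --files README.md CONTRIBUTING.md
```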
diff --git a/deploy/kubernetes/README.md b/deploy/kubernetes/README.md index 0817662..2b8007f 100644 --- a/deploy/kubernetes/README.md +++ b/deploy/kubernetes/README.md @@ -22,7 +22,6 @@ The deployment consists of: ## Deployment - ```bash kubectl apply -k deploy/kubernetes/ diff --git a/docker/README.md b/docker/README.md index 37daa76..cc868f9 100644 --- a/docker/README.md +++ b/docker/README.md @@ -10,21 +10,25 @@ This Docker Compose configuration allows you to quickly run Semantic Router + En ## Install in Docker Compose 1. **Clone the repository and navigate to the project directory** + ```bash git clone cd semantic_router ``` 2. **Download required models** (if not already present): + ```bash make download-models ``` + This will download the necessary ML models for classification: - Category classifier (ModernBERT-base) - PII classifier (ModernBERT-base) - Jailbreak classifier (ModernBERT-base) 3. **Start the services using Docker Compose** + ```bash # Start core services (semantic-router + envoy) docker-compose up --build diff --git a/e2e-tests/README.md b/e2e-tests/README.md index 478503a..3e9ab1c 100644 --- a/e2e-tests/README.md +++ b/e2e-tests/README.md @@ -38,11 +38,13 @@ This test suite provides a progressive approach to testing the Semantic Router, ## Running Tests Individual tests can be run with: + ``` python tests/XX-test-name.py ``` Or run all tests sequentially with: + ``` cd tests && python -m pytest ``` @@ -51,4 +53,4 @@ cd tests && python -m pytest - Envoy must be running (make run-envoy) - Router must be running (make run-router) -- Python dependencies installed \ No newline at end of file +- Python dependencies installed diff --git a/markdownlint.yaml b/markdownlint.yaml new file mode 100644 index 0000000..0e00f8e --- /dev/null +++ b/markdownlint.yaml @@ -0,0 +1,52 @@ +# MD003 heading-style/header-style Heading style +MD003: false + +# MD025/single-title/single-h1 +MD025: false + +# End of rules relaxed for the deep learning docs + +# MD013 Line length +MD013: false + +# MD014 Dollar signs used before commands without showing output +MD014: false + +# MD024 Multiple headings with the same content +MD024: false + +# MD026/no-trailing-punctuation Trailing punctuation in heading +MD026: false + +# MD029/ol-prefix Ordered list item prefix +MD029: false + +# MD033/no-inline-html +MD033: false + +# MD034/no-bare-urls +MD034: false + +# MD040/fenced-code-language +MD040: false + +# MD041/first-line-heading/first-line-h1 First line in file should be a top level heading +MD041: false + +# MD036/no-emphasis-as-heading Emphasis used instead of a heading +MD036: false + +# MD037/no-space-in-emphasis Spaces inside emphasis markers +MD037: false + +# MD046/code-block-style Two formats of code blocks are allowed: fenced and indented +MD046: false + +# MD022/blanks-around-headings Blank lines around headings +MD022: false + +# MD009/no-trailing-spaces Trailing spaces +MD009: false + +# MD005/list-indent Inconsistent indentation for list items +MD005: false diff --git a/src/training/dual_classifier/DUAL_CLASSIFIER_SYSTEM_TEST_SUMMARY.md b/src/training/dual_classifier/DUAL_CLASSIFIER_SYSTEM_TEST_SUMMARY.md index 80675db..3aafe64 100644 --- a/src/training/dual_classifier/DUAL_CLASSIFIER_SYSTEM_TEST_SUMMARY.md +++ b/src/training/dual_classifier/DUAL_CLASSIFIER_SYSTEM_TEST_SUMMARY.md @@ -8,46 +8,54 @@ Task 2 successfully implemented and tested a complete dual-purpose DistilBERT cl ### ✅ Component Tests (14/14 Passed) #### 1.
Synthetic Data Generator Tests + - **Initialization**: Validates proper setup of 10 categories, templates, and 5 PII pattern types - **Sample Generation**: Tests both PII and non-PII sample creation with proper labeling - **Dataset Generation**: Validates batch dataset creation with configurable PII ratios - **PII Pattern Detection**: Confirms email and phone number detection in text #### 2. Dual-Task Dataset Tests + - **Dataset Creation**: Validates PyTorch Dataset implementation with correct tensor shapes - **Tokenization**: Tests DistilBERT tokenizer integration with proper padding/truncation - **Label Alignment**: Ensures category and PII labels align with tokenized sequences #### 3. Dual-Task Loss Function Tests + - **Loss Initialization**: Validates weighted loss combining category and PII objectives - **Loss Computation**: Tests gradient flow and loss calculation for both tasks - **Padding Mask Handling**: Ensures padded tokens are properly ignored in PII loss #### 4. Dual-Task Trainer Tests + - **Trainer Initialization**: Validates setup with proper data loaders and optimizers - **Training Step**: Confirms model parameters update during training - **Evaluation**: Tests validation metrics calculation (accuracy, F1-score) - **Model Persistence**: Validates save/load functionality with state preservation #### 5. Integration Tests + - **End-to-End Training**: Complete training pipeline with 2 epochs - **Memory Efficiency**: Confirms dual-head architecture has reasonable parameter count (~67M) ## Performance Results ### Training Performance + - **Dataset Size**: 50 training samples, 20 validation samples - **Training Time**: 18.6 seconds (0.372 seconds per sample) - **Performance Rating**: πŸš€ Excellent performance! - **System**: 8-core CPU, 16GB RAM (no GPU required) ### Model Architecture + - **Base Model**: DistilBERT (66M parameters) - **Total Parameters**: 67,553,292 (efficient shared backbone) - **Category Head**: 10-class classification - **PII Head**: Token-level binary classification ### Training Results (From Previous Run) + - **Final Training Metrics**: - Training Loss: 1.4948 - Category Loss: 1.3069 @@ -60,6 +68,7 @@ Task 2 successfully implemented and tested a complete dual-purpose DistilBERT cl ## Test Infrastructure ### Automated Testing + ```bash # Run full test suite python -m pytest test_dual_classifier_system.py -v @@ -69,6 +78,7 @@ python test_dual_classifier_system.py ``` ### Manual Validation + ```bash # Test existing trained model python test_existing_model.py @@ -77,22 +87,26 @@ python test_existing_model.py ## Key Technical Achievements ### 1. **Multi-Task Learning Architecture** + - Single DistilBERT backbone serving dual purposes - Separate classification heads for different tasks - Shared representations for memory efficiency ### 2. **Robust Training Pipeline** + - Combined loss function with task weighting - Proper gradient flow and parameter updates - Validation metrics for both tasks ### 3. **Synthetic Data Generation** + - 10 category templates (math, science, history, etc.) - 5 PII pattern types (email, phone, SSN, name, address) - Configurable PII injection rates - Token-level PII labeling ### 4. 
**Production-Ready Features** + - Model persistence (save/load) - Training history tracking - Progress monitoring with tqdm @@ -101,21 +115,25 @@ python test_existing_model.py ## Testing Methodology ### Unit Tests + - Individual component validation - Mock data for isolated testing - Edge case handling ### Integration Tests + - Full pipeline validation - Real data flow testing - Performance benchmarking ### Validation Tests + - Model loading/saving - Prediction consistency - Memory efficiency ## File Structure + ``` dual_classifier/ β”œβ”€β”€ test_dual_classifier_system.py # Comprehensive test suite @@ -140,12 +158,14 @@ dual_classifier/ ## Next Steps Task 2 is fully complete and validated. The implementation provides a solid foundation for: + - Task 3: Data Pipeline Implementation (real dataset integration) - Task 4: Advanced Training Pipeline (optimization and scaling) - Task 5: Rust Implementation with Candle (performance optimization) ## Performance Notes + - Training completes in under 20 seconds for 50 samples - Model achieves 45% category accuracy and 91% PII F1-score on small synthetic dataset - Memory usage is efficient for laptop deployment -- No GPU required for development and testing \ No newline at end of file +- No GPU required for development and testing diff --git a/src/training/dual_classifier/README.md b/src/training/dual_classifier/README.md index b9cd982..dc10887 100644 --- a/src/training/dual_classifier/README.md +++ b/src/training/dual_classifier/README.md @@ -10,6 +10,7 @@ A dual-purpose neural network classifier using DistilBERT for both **category cl This project implements a **proof-of-concept** dual-task learning system that demonstrates: ### βœ… **Key Accomplishments (Task 2)** + - βœ… **Dual-Purpose Architecture**: Single DistilBERT model (~67M parameters) for both category classification and PII detection - βœ… **Memory Efficiency**: Shared backbone reduces parameters vs. 
two separate models - βœ… **Synthetic Data Pipeline**: Complete data generation with 10 categories and 5 PII pattern types @@ -20,6 +21,7 @@ This project implements a **proof-of-concept** dual-task learning system that de - βœ… **Production-Ready Features**: Progress tracking, metrics, and model checkpointing ### πŸ”¬ **POC Characteristics** + - **Data Source**: **Synthetic data generation** (not real-world datasets) - **Scale**: Small-scale validation (50 training, 20 validation samples) - **Purpose**: Architecture validation and training pipeline proof @@ -27,11 +29,13 @@ This project implements a **proof-of-concept** dual-task learning system that de - **PII Patterns**: 5 predefined types (email, phone, SSN, name, address) ### πŸš€ **Next Steps Roadmap** + - **Task 3**: Real dataset integration (transition from synthetic to production data) - **Task 4**: Advanced training optimization and scaling - **Task 5**: Rust implementation with Candle framework This POC successfully demonstrates that: + - The dual-head architecture works effectively - Multi-task learning can be implemented efficiently - The training pipeline is robust and measurable @@ -45,6 +49,7 @@ This POC successfully demonstrates that: #### `dual_classifier.py` **Main Model Implementation** + - Contains the `DualClassifier` class built on DistilBERT - Implements dual-head architecture: - **Category Head**: Sequence-level classification for 10 categories @@ -55,6 +60,7 @@ This POC successfully demonstrates that: #### `trainer.py` **Training Infrastructure** + - `DualTaskDataset`: PyTorch Dataset class for handling dual-task data - `DualTaskLoss`: Combined loss function for both classification tasks - `DualTaskTrainer`: Complete training pipeline with: @@ -66,6 +72,7 @@ This POC successfully demonstrates that: #### `data_generator.py` **Synthetic Data Generation** + - `SyntheticDataGenerator`: Creates realistic training data - **Categories**: 10 predefined categories with template texts - **PII Patterns**: 5 types (email, phone, SSN, name, address) @@ -79,6 +86,7 @@ This POC successfully demonstrates that: #### `train_example.py` **Training Demonstration** + - Complete end-to-end training example - Shows system performance monitoring (CPU, memory, GPU) - Demonstrates model training with synthetic data @@ -88,6 +96,7 @@ This POC successfully demonstrates that: #### `example.py` **Basic Usage Example** + - Simple demonstration of model usage - Shows how to: - Initialize the DualClassifier @@ -98,6 +107,7 @@ This POC successfully demonstrates that: #### `test_existing_model.py` **Trained Model Validation** + - Tests loading and using a pre-trained model - Validates that saved models work correctly - Demonstrates prediction on sample texts @@ -108,6 +118,7 @@ This POC successfully demonstrates that: #### `test_dual_classifier_system.py` **Comprehensive Test Suite** + - **14 Test Cases** covering all components: - Synthetic data generator functionality - Dataset creation and tokenization @@ -120,6 +131,7 @@ This POC successfully demonstrates that: #### `test_dual_classifier.py` **Core Model Tests** + - Unit tests for the `DualClassifier` class - Tests model initialization, forward pass, and prediction methods - Validates tensor shapes and output formats @@ -130,6 +142,7 @@ This POC successfully demonstrates that: #### `requirements.txt` **Project Dependencies** + - **PyTorch**: `>=2.0.0,<=2.2.2` (Neural network backend) - **Transformers**: `>=4.36.0,<4.45.0` (DistilBERT model) - **NumPy**: `>=1.24.0,<2.0` (Numerical operations) @@ 
-141,6 +154,7 @@ This POC successfully demonstrates that: #### `DUAL_CLASSIFIER_SYSTEM_TEST_SUMMARY.md` **Test Results & Documentation** + - Comprehensive testing summary with all results - Performance benchmarks and system requirements - Technical achievements and success criteria @@ -151,6 +165,7 @@ This POC successfully demonstrates that: #### `trained_model/` Directory **Saved Model Files** + - `model.pt` (258MB): Complete trained model state - `config.json`: Model configuration and hyperparameters - `training_history.json`: Training metrics and loss curves @@ -161,26 +176,31 @@ This POC successfully demonstrates that: ## πŸš€ Quick Start ### 1. Install Dependencies + ```bash pip install -r requirements.txt ``` ### 2. Run Basic Example + ```bash python example.py ``` ### 3. Train Your Own Model + ```bash python train_example.py ``` ### 4. Test Existing Model + ```bash python test_existing_model.py ``` ### 5. Run Full Test Suite + ```bash python -m pytest test_dual_classifier_system.py -v ``` @@ -188,6 +208,7 @@ python -m pytest test_dual_classifier_system.py -v ## πŸ—οΈ Architecture ### Model Architecture + - **Base Model**: DistilBERT (66M parameters) - **Total Parameters**: 67,553,292 - **Category Head**: 10-class sequence classification @@ -195,6 +216,7 @@ python -m pytest test_dual_classifier_system.py -v - **Shared Backbone**: Memory-efficient design ### Training Pipeline + - **Multi-task Loss**: Weighted combination of category and PII losses - **Metrics**: Category accuracy and PII F1-score - **Data**: Synthetic generation with configurable PII injection @@ -203,11 +225,13 @@ python -m pytest test_dual_classifier_system.py -v ## πŸ“Š Performance ### Training Performance + - **Training Time**: ~18.6 seconds for 50 samples - **System Requirements**: 8-core CPU, 16GB RAM (no GPU required) - **Memory Efficiency**: Single model vs. two separate models ### Model Performance + - **Category Accuracy**: 45% (on small synthetic dataset) - **PII F1-Score**: 91.09% - **Training Loss**: 1.4948 (final) @@ -216,6 +240,7 @@ python -m pytest test_dual_classifier_system.py -v ## πŸ§ͺ Testing The project includes comprehensive testing with 14 test cases covering: + - βœ… Synthetic data generation - βœ… Dataset creation and tokenization - βœ… Loss function computation @@ -228,6 +253,7 @@ All tests pass with excellent performance ratings. ## πŸ“ˆ Next Steps This implementation provides a foundation for: + - **Task 3**: Real dataset integration - **Task 4**: Advanced training optimization - **Task 5**: Rust implementation with Candle framework @@ -235,6 +261,7 @@ This implementation provides a foundation for: ## 🀝 Usage Examples ### Basic Prediction + ```python from dual_classifier import DualClassifier @@ -301,12 +328,14 @@ for token, pred in zip(tokens, pii_predictions): ``` **Key Points:** + - πŸ“ **Single Input, Dual Output**: One text β†’ category + PII results simultaneously - πŸ”„ **`encode_text()`**: Just preprocessing, no predictions - 🎯 **`predict()`**: Does BOTH tasks at once using shared DistilBERT backbone - 🧠 **Memory Efficient**: Single model handles both tasks vs. 
separate models ### Training New Model + ```python from trainer import DualTaskTrainer from data_generator import create_sample_datasets @@ -317,4 +346,4 @@ train_dataset, val_dataset = create_sample_datasets() # Train model trainer = DualTaskTrainer(model, train_dataset, val_dataset) trainer.train(num_epochs=2) -``` \ No newline at end of file +``` diff --git a/src/training/dual_classifier/trained_model/README.md b/src/training/dual_classifier/trained_model/README.md index b13ef70..78738f5 100644 --- a/src/training/dual_classifier/trained_model/README.md +++ b/src/training/dual_classifier/trained_model/README.md @@ -14,6 +14,7 @@ This directory contains the trained model files for the dual classifier. Due to ## To generate these files: Run the training script to create a new model: + ```bash cd dual_classifier python train_example.py @@ -22,6 +23,7 @@ python train_example.py ## Alternative storage: For sharing large model files, consider: + - Git LFS (Large File Storage) - Cloud storage (S3, Google Drive, etc.) -- Model registries (HuggingFace Hub, MLflow, etc.) \ No newline at end of file +- Model registries (HuggingFace Hub, MLflow, etc.) diff --git a/website/README.md b/website/README.md index 9713709..d0a11c0 100644 --- a/website/README.md +++ b/website/README.md @@ -5,12 +5,14 @@ This directory contains the Docusaurus-based documentation website for the vLLM ## πŸš€ Quick Start ### Prerequisites + - Node.js 18+ - npm or yarn ### Development Start the development server with hot reload: + ```bash # From project root make docs-dev @@ -24,6 +26,7 @@ The site will be available at http://localhost:3000 ### Production Build Build the static site for production: + ```bash # From project root make docs-build @@ -35,6 +38,7 @@ cd website && npm run build ### Preview Production Build Serve the production build locally: + ```bash # From project root make docs-serve @@ -46,18 +50,21 @@ cd website && npm run serve ## 🎨 Features ### ✨ Modern Tech-Inspired Design + - **Dark theme by default** with neon blue/green accents - **Glassmorphism effects** with backdrop blur and transparency - **Gradient backgrounds** and animated hover effects - **Responsive design** optimized for all devices ### πŸ”§ Enhanced Functionality + - **Mermaid diagram support** with dark theme optimization - **Advanced code highlighting** with multiple language support - **Interactive navigation** with smooth animations - **Search functionality** (ready for Algolia integration) ### πŸ“± User Experience + - **Fast loading** with optimized builds - **Accessible design** following WCAG guidelines - **Mobile-first** responsive layout @@ -82,6 +89,7 @@ website/ ### Themes and Colors Edit `src/css/custom.css` to modify: + - Color scheme and gradients - Typography and spacing - Component styling @@ -89,12 +97,14 @@ Edit `src/css/custom.css` to modify: ### Navigation Update `sidebars.js` to modify: + - Documentation structure - Category organization - Page ordering ### Site Configuration Modify `docusaurus.config.js` for: + - Site metadata - Plugin configuration - Theme settings diff --git a/website/blog/2025-09-06-welcome.md b/website/blog/2025-09-06-welcome.md index 895f126..48d4318 100644 --- a/website/blog/2025-09-06-welcome.md +++ b/website/blog/2025-09-06-welcome.md @@ -5,7 +5,7 @@ authors: [rootfs, wangchen615, yuezhu1, Xunzhuo] tags: [welcome, announcement, vllm, semantic-router] --- -![](/img/code.png) +![code](/img/code.png) @@ -51,7 +51,7 @@ To overcome this gap, we introduce the **vLLM Semantic Router** β€” an intent-aw By 
classifying queries at the semantic level and selectively enabling reasoning, the vLLM Semantic Router delivers **higher accuracy where it matters** and **significant cost savings where it doesn’t** β€” a step toward the principle that no token should be wasted. -![](/img/architecture.png) +![architecture](/img/architecture.png) ### Architecture Design diff --git a/website/docs/api/classification.md b/website/docs/api/classification.md index cda2344..8ddbecb 100644 --- a/website/docs/api/classification.md +++ b/website/docs/api/classification.md @@ -5,6 +5,7 @@ The Classification API provides direct access to the Semantic Router's classific ## API Endpoints ### Base URL + ``` http://localhost:8080/api/v1/classify ``` @@ -12,11 +13,13 @@ http://localhost:8080/api/v1/classify ## Server Status The Classification API server runs alongside the main Semantic Router ExtProc server: + - **Classification API**: `http://localhost:8080` (HTTP REST API) - **ExtProc Server**: `http://localhost:50051` (gRPC for Envoy integration) - **Metrics Server**: `http://localhost:9190` (Prometheus metrics) Start the server with: + ```bash make run-router ``` @@ -24,6 +27,7 @@ make run-router ## Implementation Status ### βœ… Fully Implemented + - `GET /health` - Health check endpoint - `POST /api/v1/classify/intent` - Intent classification with real model inference - `POST /api/v1/classify/pii` - PII detection with real model inference @@ -33,6 +37,7 @@ make run-router - `GET /info/classifier` - Detailed classifier capabilities and configuration ### πŸ”„ Placeholder Implementation + - `POST /api/v1/classify/combined` - Returns "not implemented" response - `GET /metrics/classification` - Returns "not implemented" response - `GET /config/classification` - Returns "not implemented" response @@ -122,6 +127,7 @@ Classify user queries into routing categories. ### Available Categories The current model supports the following 14 categories: + - `business` - `law` - `psychology` @@ -369,6 +375,7 @@ api: ### Error Handling **Batch Too Large (400 Bad Request):** + ```json { "error": { @@ -380,6 +387,7 @@ api: ``` **Empty Batch (400 Bad Request):** + ```json { "error": { @@ -625,6 +633,7 @@ Get real-time classification performance metrics. ### Example Error Responses **Invalid Input (400 Bad Request):** + ```json { "error": { @@ -636,6 +645,7 @@ Get real-time classification performance metrics. ``` **Not Implemented (501 Not Implemented):** + ```json { "error": { diff --git a/website/docs/api/router.md b/website/docs/api/router.md index 9795ac5..4ec6b01 100644 --- a/website/docs/api/router.md +++ b/website/docs/api/router.md @@ -271,6 +271,7 @@ model_config: ``` Notes: + - Pricing is optional; if omitted, cost is treated as 0 and only token metrics are emitted. - Cost is computed as: (prompt_tokens * prompt_per_1m + completion_tokens * completion_per_1m) / 1_000_000 (in the configured currency). diff --git a/website/docs/architecture/system-architecture.md b/website/docs/architecture/system-architecture.md index 4eb5ced..71420ed 100644 --- a/website/docs/architecture/system-architecture.md +++ b/website/docs/architecture/system-architecture.md @@ -80,6 +80,7 @@ graph TB - **Timeout Management**: Configures appropriate timeouts for different model types **Configuration Highlights**: + ```yaml # Envoy listener configuration listeners: @@ -107,6 +108,7 @@ http_filters: **Role**: The brain of the system that makes intelligent routing decisions. 
**Architecture**: + ```go type OpenAIRouter struct { Config *config.RouterConfig @@ -122,6 +124,7 @@ type OpenAIRouter struct { ``` **Processing Pipeline**: + ```mermaid sequenceDiagram participant E as Envoy @@ -157,6 +160,7 @@ sequenceDiagram The classification system uses ModernBERT models for multiple classification tasks: #### Category Classification + ```mermaid graph LR Query[User Query] --> Tokenizer[ModernBERT Tokenizer] @@ -182,6 +186,7 @@ graph LR ``` #### Multi-Task Architecture + ```python # Conceptual model architecture class SemanticRouter: diff --git a/website/docs/getting-started/configuration.md b/website/docs/getting-started/configuration.md index eeaa79b..a5a6f6e 100644 --- a/website/docs/getting-started/configuration.md +++ b/website/docs/getting-started/configuration.md @@ -265,6 +265,7 @@ default_reasoning_effort: "medium" #### Model Reasoning Configuration Options **Configuration Structure:** + - `name`: A unique identifier for the model family - `patterns`: Array of patterns to match against model names - `reasoning_syntax.type`: How the model expects reasoning mode to be specified @@ -282,6 +283,7 @@ The system supports both simple string patterns and regular expressions for flex - **Multiple patterns**: `["deepseek", "ds-", "^phi.*"]` matches any of these patterns **Regex Pattern Examples:** + ```yaml patterns: - "^gpt-4.*" # Models starting with "gpt-4" @@ -429,11 +431,13 @@ api: The configuration includes preset examples for quick setup. Here's how to use them: **Step 1: Choose your scenario** + - `fast` - For real-time APIs (microsecond to millisecond response times) - `standard` - For typical web APIs (millisecond to second response times) - `slow` - For batch processing or heavy computation (seconds to minutes) **Step 2: Copy the preset values** + ```yaml # Example: Switch to fast API configuration # Copy from preset_examples.fast and paste to the actual config: @@ -442,6 +446,7 @@ size_buckets: [1, 2, 3, 5, 8, 10] ``` **Step 3: Restart the service** + ```bash pkill -f "router" make run-router @@ -463,6 +468,7 @@ The system provides sensible default batch size ranges that work well for most u ### Configuration Examples by Use Case **Real-time Chat API (fast preset)** + ```yaml # Copy these values to your config for sub-millisecond monitoring duration_buckets: [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1] @@ -471,6 +477,7 @@ size_buckets: [1, 2, 3, 5, 8, 10] ``` **E-commerce API (standard preset)** + ```yaml # Copy these values for typical web API response times duration_buckets: [0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10] @@ -479,6 +486,7 @@ size_buckets: [1, 2, 5, 10, 20, 50, 100] ``` **Data Processing Pipeline (slow preset)** + ```yaml # Copy these values for heavy computation workloads duration_buckets: [0.1, 0.5, 1, 5, 10, 30, 60, 120] @@ -492,6 +500,7 @@ batch_size_ranges: ``` **Available Metrics:** + - `batch_classification_requests_total` - Total number of batch requests - `batch_classification_duration_seconds` - Processing duration histogram - `batch_classification_texts_total` - Total number of texts processed @@ -576,6 +585,7 @@ make run-router ### Common Configuration Patterns **Multiple Models:** + ```yaml vllm_endpoints: - name: "math_endpoint" @@ -601,6 +611,7 @@ categories: ``` **Load Balancing:** + ```yaml vllm_endpoints: - name: "endpoint1" @@ -658,6 +669,7 @@ classifier: ### Development vs Production **Development:** + ```yaml # Relaxed settings for testing classifier: @@ -670,6 +682,7 @@ semantic_cache: ``` 
**Production:** + ```yaml # Strict settings for production classifier: @@ -686,12 +699,14 @@ semantic_cache: ### Common Issues **Invalid YAML syntax:** + ```bash # Validate YAML syntax python -c "import yaml; yaml.safe_load(open('config/config.yaml'))" ``` **Missing model files:** + ```bash # Check if models are downloaded ls -la models/ @@ -699,12 +714,14 @@ ls -la models/ ``` **Endpoint connectivity:** + ```bash # Test your backend server curl -f http://your-server:8000/health ``` **Configuration not taking effect:** + ```bash # Restart the router after config changes make run-router @@ -723,17 +740,20 @@ make test-prompt-guard # Jailbreak protection ### Model Reasoning Configuration Issues **Model not getting reasoning fields:** + - Check that the model name matches a pattern in `model_reasoning_configs` - Verify the pattern syntax (exact matches vs prefixes) - Unknown models will have no reasoning fields applied (this is by design) **Wrong reasoning syntax applied:** + - Ensure the `reasoning_syntax.type` matches your model's expected format - Check the `reasoning_syntax.parameter` name is correct - DeepSeek models typically use `chat_template_kwargs` with `"thinking"` - GPT models typically use `reasoning_effort` **Adding support for new models:** + ```yaml # Add a new model configuration model_reasoning_configs: @@ -745,6 +765,7 @@ model_reasoning_configs: ``` **Testing model reasoning configuration:** + ```bash # Test reasoning with your specific model curl -X POST http://localhost:8801/v1/chat/completions \ @@ -762,6 +783,7 @@ The Semantic Router supports automated configuration generation based on model p ### Benchmarking Workflow 1. **Run MMLU-Pro Evaluation:** + ```bash # Evaluate models using MMLU-Pro benchmark python src/training/model_eval/mmlu_pro_vllm_eval.py \ @@ -774,6 +796,7 @@ The Semantic Router supports automated configuration generation based on model p ``` 2. **Generate Configuration:** + ```bash # Generate config.yaml from benchmark results python src/training/model_eval/result_to_config.py \ @@ -845,6 +868,7 @@ make test ``` This workflow ensures your configuration is: + - Based on actual model performance - Properly tested before deployment - Version controlled for tracking changes diff --git a/website/docs/getting-started/installation.md b/website/docs/getting-started/installation.md index 1ddeaca..59f0ccc 100644 --- a/website/docs/getting-started/installation.md +++ b/website/docs/getting-started/installation.md @@ -25,6 +25,7 @@ cd semantic-router ### 2. 
Install Dependencies #### Install Go (if not already installed) + ```bash # Check if Go is installed go version @@ -36,6 +37,7 @@ go version ``` #### Install Rust (if not already installed) + ```bash # Check if Rust is installed rustc --version @@ -46,6 +48,7 @@ source ~/.cargo/env ``` #### Install Python (if not already installed) + ```bash # Check if Python is installed python --version @@ -56,6 +59,7 @@ python --version ``` #### Install HuggingFace CLI + ```bash pip install huggingface_hub ``` @@ -68,6 +72,7 @@ make build ``` This command will: + - Build the Rust candle-binding library - Build the Go router binary - Place the executable in `bin/router` @@ -80,6 +85,7 @@ make download-models ``` This downloads the CPU-optimized BERT models for: + - Category classification - PII detection - Jailbreak detection @@ -118,11 +124,13 @@ The default configuration includes example endpoints that you should update for Open two terminals and run: **Terminal 1: Start Envoy Proxy** + ```bash make run-envoy ``` **Terminal 2: Start Semantic Router** + ```bash make run-router ``` diff --git a/website/docs/getting-started/reasoning-routing-quickstart.md b/website/docs/getting-started/reasoning-routing-quickstart.md index 19b4207..d51057e 100644 --- a/website/docs/getting-started/reasoning-routing-quickstart.md +++ b/website/docs/getting-started/reasoning-routing-quickstart.md @@ -1,16 +1,19 @@ # Reasoning Routing Quickstart This short guide shows how to enable and verify β€œreasoning routing” in the Semantic Router: + - Minimal config.yaml fields you need - Example request/response (OpenAI-compatible) - A comprehensive evaluation command you can run Prerequisites + - A running OpenAI-compatible backend for your models (e.g., vLLM or any OpenAI-compatible server). It must be reachable at the addresses you configure under vllm_endpoints (address:port). - Envoy + the router (see Start the router section) 1) Minimal configuration Put this in config/config.yaml (or merge into your existing config). It defines: + - Categories that require reasoning (e.g., math) - Reasoning families for model syntax differences (DeepSeek/Qwen3 use chat_template_kwargs; GPT-OSS/GPT use reasoning_effort) - Which concrete models use which reasoning family @@ -84,6 +87,7 @@ default_model: qwen3-30b ``` Notes + - Reasoning is controlled by categories.use_reasoning and optionally categories.reasoning_effort. - A model only gets reasoning fields if it has a model_config.<MODEL>.reasoning_family that maps to a reasoning_families entry. - DeepSeek/Qwen3 (chat_template_kwargs): the router injects chat_template_kwargs only when reasoning is enabled. When disabled, no chat_template_kwargs are added. 
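To make the two injection syntaxes in the notes above concrete, here is a hedged sketch of what the router effectively adds for each model family when reasoning is enabled. The request shapes are illustrative rather than captured router output; the endpoint and model names are the ones used elsewhere in this guide:

```bash
# Qwen3/DeepSeek family: reasoning is toggled via chat_template_kwargs
curl -sS http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "qwen3-30b",
    "messages": [{"role": "user", "content": "Solve 2x + 5 = 15"}],
    "chat_template_kwargs": {"thinking": true}
  }'

# GPT/GPT-OSS family: reasoning is toggled via reasoning_effort instead
curl -sS http://localhost:8801/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{
    "model": "openai/gpt-oss-20b",
    "messages": [{"role": "user", "content": "Solve 2x + 5 = 15"}],
    "reasoning_effort": "high"
  }'
```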
@@ -93,6 +97,7 @@ Notes 2) Start the router Option A: Local build + Envoy + - Download classifier models and mappings (required) - make download-models - Build and run the router @@ -102,6 +107,7 @@ Option A: Local build + Envoy - func-e run --config-path config/envoy.yaml --component-log-level "ext_proc:trace,router:trace,http:trace" Option B: Docker Compose + - docker compose up -d - Exposes Envoy at http://localhost:8801 (proxying /v1/* to backends via the router) @@ -109,6 +115,7 @@ Note: Ensure your OpenAI-compatible backend is running and reachable (e.g., http 3) Send example requests Math (reasoning should be ON and effort high) + ```bash curl -sS http://localhost:8801/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -122,6 +129,7 @@ curl -sS http://localhost:8801/v1/chat/completions \ ``` General (reasoning should be OFF) + ```bash curl -sS http://localhost:8801/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -136,10 +144,12 @@ curl -sS http://localhost:8801/v1/chat/completions \ Verify routing via response headers The router does not inject routing metadata into the JSON body. Instead, inspect the response headers added by the router: + - X-Selected-Model - X-Semantic-Destination-Endpoint Example: + ```bash curl -i http://localhost:8801/v1/chat/completions \ -H "Content-Type: application/json" \ @@ -159,6 +169,7 @@ curl -i http://localhost:8801/v1/chat/completions \ You can benchmark the router vs a direct vLLM endpoint across categories using the included script. This runs a ReasoningBench based on MMLU-Pro and produces summaries and plots. Quick start (router + vLLM): + ```bash SAMPLES_PER_CATEGORY=25 \ CONCURRENT_REQUESTS=4 \ @@ -168,6 +179,7 @@ VLLM_MODELS="openai/gpt-oss-20b" \ ``` Router-only benchmark: + ```bash BENCHMARK_ROUTER_ONLY=true \ SAMPLES_PER_CATEGORY=25 \ @@ -177,6 +189,7 @@ ROUTER_MODELS="auto" \ ``` Direct invocation (advanced): + ```bash python bench/router_reason_bench.py \ --run-router \ @@ -191,8 +204,8 @@ python bench/router_reason_bench.py \ ``` Tips + - If your math request doesn’t enable reasoning, confirm the classifier assigns the "math" category with sufficient confidence (see classifier.category_model.threshold) and that the target model has a reasoning_family. - For models without a reasoning_family, the router will not inject reasoning fields even when the category requires reasoning (this is by design to avoid invalid requests). - You can override the effort per category via categories.reasoning_effort or set a global default via default_reasoning_effort. - Ensure your OpenAI-compatible backend is reachable at the configured vllm_endpoints (address:port). If it’s not running, routing will fail even though the router and Envoy are up. 
- diff --git a/website/docs/intro.md b/website/docs/intro.md index 908904e..5d25394 100644 --- a/website/docs/intro.md +++ b/website/docs/intro.md @@ -7,7 +7,7 @@ sidebar_position: 1 [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](https://github.com/vllm-project/semantic-router/blob/main/LICENSE) [![Hugging Face](https://img.shields.io/badge/πŸ€—%20Hugging%20Face-Community-yellow)](https://huggingface.co/LLM-Semantic-Router) [![Go Report Card](https://goreportcard.com/badge/github.com/vllm-project/semantic-router/src/semantic-router)](https://goreportcard.com/report/github.com/vllm-project/semantic-router/src/semantic-router) -![](https://github.com/vllm-project/semantic-router/workflows/Test%20And%20Build/badge.svg) +![Test And Build](https://github.com/vllm-project/semantic-router/workflows/Test%20And%20Build/badge.svg) An intelligent **Mixture-of-Models (MoM)** router that acts as an Envoy External Processor (ExtProc) to intelligently direct OpenAI API requests to the most suitable backend model from a defined pool. Using BERT-based semantic understanding and classification, it optimizes both performance and cost efficiency. @@ -15,21 +15,25 @@ An intelligent **Mixture-of-Models (MoM)** router that acts as an Envoy External ### 🎯 **Auto-selection of Models** Intelligently routes requests to specialized models based on semantic understanding: + - **Math queries** β†’ Math-specialized models - **Creative writing** β†’ Creative-specialized models - **Code generation** β†’ Code-specialized models - **General queries** β†’ Balanced general-purpose models ### πŸ›‘οΈ **Security & Privacy** + - **PII Detection**: Automatically detects and handles personally identifiable information - **Prompt Guard**: Identifies and blocks jailbreak attempts - **Safe Routing**: Ensures sensitive prompts are handled appropriately ### ⚑ **Performance Optimization** + - **Semantic Cache**: Caches semantic representations to reduce latency - **Tool Selection**: Auto-selects relevant tools to reduce token usage and improve tool selection accuracy ### πŸ—οΈ **Architecture** + - **Envoy ExtProc Integration**: Seamlessly integrates with Envoy proxy - **Dual Implementation**: Available in both Go (with Rust FFI) and Python - **Scalable Design**: Production-ready with comprehensive monitoring @@ -40,7 +44,6 @@ Our testing shows significant improvements in model accuracy through specialized ![Model Accuracy](/img/category_accuracies.png) - ## πŸ› οΈ Architecture Overview ```mermaid @@ -84,6 +87,7 @@ graph TB ## πŸ“ˆ Monitoring & Observability The router provides comprehensive monitoring through: + - **Grafana Dashboard**: Real-time metrics and performance tracking - **Prometheus Metrics**: Detailed routing statistics and performance data - **Request Tracing**: Full visibility into routing decisions and performance diff --git a/website/docs/overview/mixture-of-models.md b/website/docs/overview/mixture-of-models.md index 7ef52f9..ecb5d59 100644 --- a/website/docs/overview/mixture-of-models.md +++ b/website/docs/overview/mixture-of-models.md @@ -9,6 +9,7 @@ The Mixture of Models (MoM) approach represents a fundamental shift from traditi When organizations deploy a single high-performance model (like GPT-4 or Claude-3) for all use cases, they encounter several critical issues: #### 1. **Economic Inefficiency** + ``` Example: Customer Support Chatbot - Simple FAQ: "What are your hours?" @@ -20,6 +21,7 @@ Example: Customer Support Chatbot ``` #### 2. 
**Performance Suboptimality** + ``` Math Problem: "Solve 2x + 5 = 15" - General GPT-4: Good performance, but overkill @@ -33,12 +35,14 @@ Creative Writing: "Write a poem about spring" ``` #### 3. **Resource Waste** + - **Computing Power**: Using a 1.8T parameter model for simple classification - **Memory**: Loading massive models for lightweight tasks - **Latency**: Slower inference for tasks that could be handled quickly - **Throughput**: Lower requests/second due to model size #### 4. **Operational Risks** + - **Single Point of Failure**: Model downtime affects entire system - **Vendor Lock-in**: Dependent on single provider's availability and pricing - **Limited Flexibility**: Cannot optimize for specific use cases @@ -65,6 +69,7 @@ graph TB ``` **Cost Impact Analysis:** + ```python # Traditional approach traditional_cost = 100000 * 0.03 # All queries to GPT-4 @@ -128,6 +133,7 @@ graph TB **Challenge**: Balance customer satisfaction with operational costs #### Before MoM: + ``` Setup: GPT-4 for all customer service queries Daily Cost: $4,500 @@ -139,6 +145,7 @@ Issues: ``` #### After MoM Implementation: + ```python # Query distribution and routing routing_strategy = { @@ -170,6 +177,7 @@ routing_strategy = { ``` #### Results: + - **Cost Reduction**: 72% ($4,500 β†’ $1,260/day) - **Customer Satisfaction**: +12% (specialized models performed better) - **Response Time**: -35% average latency @@ -182,6 +190,7 @@ routing_strategy = { **Use Cases**: Code review, documentation generation, bug analysis #### Implementation Strategy: + ```mermaid graph TB CodeQuery[Code Query] --> Classifier[Code Intent Classifier] @@ -198,6 +207,7 @@ graph TB ``` #### Performance Metrics: + | Metric | Before MoM | After MoM | Improvement | |--------|------------|-----------|-------------| | Daily Cost | $750 | $285 | 62% reduction | @@ -212,6 +222,7 @@ graph TB **Challenge**: Provide personalized learning assistance across multiple subjects #### Specialized Model Deployment: + ```python subject_routing = { "mathematics": { @@ -253,6 +264,7 @@ subject_routing = { ``` #### Educational Impact: + - **Cost Efficiency**: $3,000/day β†’ $890/day (70% reduction) - **Learning Outcomes**: +23% improvement in problem-solving scores - **Personalization**: Better subject-specific assistance @@ -327,6 +339,7 @@ scaling_rules = { ### Challenge 1: Router Accuracy **Problem**: Incorrect routing leads to poor user experience **Solution**: + - Multi-stage classification with confidence scores - Fallback mechanisms for uncertain classifications - Continuous learning from user feedback @@ -347,6 +360,7 @@ def route_query(query): ### Challenge 2: Latency Overhead **Problem**: Classification adds latency to each request **Solution**: + - Optimized lightweight classifiers (<10ms inference) - Parallel processing of classification and request preparation - Caching of classification results for similar queries @@ -354,6 +368,7 @@ def route_query(query): ### Challenge 3: Context Preservation **Problem**: Switching models mid-conversation loses context **Solution**: + - Conversation-aware routing (same model for session) - Context summarization and transfer between models - Hybrid approaches with context bridges @@ -389,6 +404,7 @@ print(f"ROI achieved in: {roi_months:.1f} months") ``` **Output:** + ``` 12-month savings: $165,000.00 ROI achieved in: 3.6 months diff --git a/website/docs/overview/semantic-router-overview.md b/website/docs/overview/semantic-router-overview.md index 61e8369..8b30ebc 100644 --- 
a/website/docs/overview/semantic-router-overview.md +++ b/website/docs/overview/semantic-router-overview.md @@ -16,6 +16,7 @@ Semantic routing is the process of **dynamically selecting the most suitable lan ## The Evolution of LLM Routing ### Traditional Approach: One-Size-Fits-All + ```mermaid graph LR Query[User Query] --> Model[Single LLM
GPT-4, Claude, etc.] @@ -30,6 +31,7 @@ graph LR - No flexibility in model selection ### Modern Approach: Semantic Routing + ```mermaid graph TB Query[User Query] --> Router[Semantic Router
BERT Classifier] @@ -124,6 +126,7 @@ graph TB - **Benchmark Performance**: Evaluated on MMLU, GSM8K, and MT Bench **Training Approach:** + ```python # RouteLLM training conceptually preference_data = load_chatbot_arena_data() # Human comparisons @@ -147,6 +150,7 @@ GPT-5 introduces a revolutionary **router-as-coordinator** architecture: - **Efficiency**: Computation flows along optimal paths **Operational Flow:** + ```mermaid sequenceDiagram participant User @@ -173,6 +177,7 @@ sequenceDiagram ## Why Mixture of Models is Superior ### 1. **Economic Efficiency** + ``` Traditional: All queries β†’ GPT-4 ($0.03/1K tokens) MoM Routing: @@ -182,16 +187,19 @@ MoM Routing: ``` ### 2. **Performance Specialization** + - **Domain Expertise**: Code generation models excel at programming tasks - **Task Optimization**: Math models optimized for numerical reasoning - **Context Efficiency**: Smaller models for simpler tasks reduce latency ### 3. **Flexibility and Scalability** + - **Model Independence**: Each model can be updated independently - **Provider Diversity**: Mix OpenAI, Anthropic, local, and fine-tuned models - **Easy Extensions**: Add new specialized models without system redesign ### 4. **Risk Distribution** + - **Vendor Independence**: Not locked into single provider - **Failure Isolation**: One model failure doesn't affect others - **A/B Testing**: Easy to test new models in production @@ -201,6 +209,7 @@ MoM Routing: ### Case Study: Enterprise API Gateway **Before Semantic Routing:** + ``` Workload: 100K queries/day Model: GPT-4 for all queries @@ -209,6 +218,7 @@ Quality: High but inconsistent for simple tasks ``` **After Semantic Routing:** + ``` Workload: 100K queries/day distributed as: - 60% simple β†’ GPT-3.5: $120/day diff --git a/website/docs/training/training-overview.md b/website/docs/training/training-overview.md index b049afd..be4f000 100644 --- a/website/docs/training/training-overview.md +++ b/website/docs/training/training-overview.md @@ -41,18 +41,21 @@ graph TB [ModernBERT](https://arxiv.org/abs/2412.13663) represents the latest evolution in BERT architecture with several key improvements over traditional BERT models: #### 1. **Enhanced Architecture** + - **Rotary Position Embedding (RoPE)**: Better handling of positional information - **GeGLU Activation**: Improved gradient flow and representation capacity - **Attention Bias Removal**: Cleaner attention mechanisms - **Modern Layer Normalization**: Better training stability #### 2. **Training Improvements** + - **Longer Context**: Trained on sequences up to 8,192 tokens vs BERT's 512 - **Better Data**: Trained on higher-quality, more recent datasets - **Improved Tokenization**: More efficient vocabulary and tokenization - **Anti-overfitting Techniques**: Built-in regularization improvements #### 3. **Performance Benefits** + ```python # Performance comparison on classification tasks model_performance = { @@ -212,6 +215,7 @@ class UnifiedBERTFinetuning: **Purpose**: Route queries to specialized models based on academic/professional domains. #### Dataset: MMLU-Pro Academic Domains + ```python # Dataset composition mmlu_categories = { @@ -245,6 +249,7 @@ mmlu_categories = { ``` #### Training Configuration + ```yaml model_config: base_model: "modernbert-base" @@ -264,6 +269,7 @@ evaluation_metrics: ``` #### Model Performance + ```python category_performance = { "overall_accuracy": 0.942, @@ -286,6 +292,7 @@ category_performance = { **Purpose**: Identify personally identifiable information to protect user privacy. 
#### Dataset: Microsoft Presidio + Custom Synthetic Data + ```python # PII entity types and examples pii_entities = { @@ -323,6 +330,7 @@ pii_entities = { ``` #### Training Approach: Token Classification + ```python class PIITokenClassifier: def __init__(self): @@ -353,6 +361,7 @@ class PIITokenClassifier: ``` #### Performance Metrics + ```python pii_performance = { "overall_f1": 0.957, @@ -376,6 +385,7 @@ pii_performance = { **Purpose**: Identify and block attempts to circumvent AI safety measures. #### Dataset: Jailbreak Classification Dataset + ```python jailbreak_dataset = { "benign": { @@ -402,6 +412,7 @@ jailbreak_dataset = { ``` #### Training Strategy + ```python class JailbreakDetector: def __init__(self): @@ -423,6 +434,7 @@ class JailbreakDetector: ``` #### Performance Analysis + ```python jailbreak_performance = { "overall_metrics": { @@ -451,6 +463,7 @@ jailbreak_performance = { **Purpose**: Classify queries for tool selection and function calling optimization. #### Dataset: Glaive Function Calling v2 + ```python intent_categories = { "information_retrieval": { @@ -499,6 +512,7 @@ intent_categories = { ## Training Infrastructure ### Hardware Requirements + ```yaml training_infrastructure: gpu_requirements: @@ -517,6 +531,7 @@ training_infrastructure: ``` ### Training Pipeline Automation + ```python class TrainingPipeline: def __init__(self, config_path):