From 01229cb6ef902e67bb53d270b31ec3c88823b391 Mon Sep 17 00:00:00 2001 From: mzfive Date: Wed, 13 Aug 2025 20:52:34 +0200 Subject: [PATCH] Release MLX Knife 1.0-rc2: Enhanced Memory Management & Exception Safety **Key Improvements** - Robust exception handling during model loading with guaranteed cleanup - Protection against nested context manager usage in MLXRunner - Safe cleanup that handles partial loading failures gracefully - Exception-resilient cache clearing operations - Safe tokenizer attribute access with proper defaults - Graceful memory statistics handling when metrics unavailable - Comprehensive unit test coverage for memory management edge cases **Changes** - Updated version to 1.0-rc2 across all documentation files - Enhanced MLXRunner context manager with bulletproof exception safety - Added comprehensive unit tests for memory management scenarios - Improved error handling for partial model loading failures - Updated test coverage documentation (96/96 tests passing) - Refined README to focus on key features rather than test metrics This release focuses on production-ready memory management and exception safety, making MLX Knife more robust for real-world usage scenarios. --- CHANGELOG.md | 18 +- README.md | 12 +- SECURITY.md | 2 +- TESTING.md | 26 +-- mlx_knife/__init__.py | 2 +- mlx_knife/mlx_runner.py | 106 ++++++---- tests/unit/test_mlx_runner_memory.py | 295 +++++++++++++++++++++++++++ 7 files changed, 404 insertions(+), 57 deletions(-) create mode 100644 tests/unit/test_mlx_runner_memory.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 211c8bf..d47dd4d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,21 @@ # Changelog +## [1.0-rc2] - 2025-08-13 + +### Enhanced +- Robust exception handling during model loading with guaranteed cleanup +- Protection against nested context manager usage +- Safe cleanup that handles partial loading failures +- Exception-resilient cache clearing (won't fail if cache operations error) +- Safe tokenizer attribute access using getattr() with defaults +- Graceful memory stats handling when metrics unavailable +- Comprehensive unit test coverage for all memory management edge cases + +### Fixed +- Memory management edge cases in MLXRunner context manager +- Exception safety during model loading and cleanup operations +- Improved error handling for partial model loading failures + ## [1.0-rc1] - 2025-08-12 ### Added @@ -10,5 +26,5 @@ - Multi-Python support (3.9-3.13) - Comprehensive test suite (86/86 passing) -### Known Issues +## Known Issues - See GitHub Issues for tracking \ No newline at end of file diff --git a/README.md b/README.md index 6c2e92b..05f8f92 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,7 @@ A lightweight, ollama-like CLI for managing and running MLX models on Apple Silicon. **Designed for personal, local use** - perfect for individual developers and researchers working with MLX models. -**Current Version**: 1.0-rc1 (August 2025) +**Current Version**: 1.0-rc2 (August 2025) [![GitHub Release](https://img.shields.io/github/v/release/mzau/mlx-knife)](https://github.com/mzau/mlx-knife/releases) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) @@ -14,7 +14,7 @@ A lightweight, ollama-like CLI for managing and running MLX models on Apple Sili [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/) [![Apple Silicon](https://img.shields.io/badge/Apple%20Silicon-M1%2FM2%2FM3-green.svg)](https://support.apple.com/en-us/HT211814) [![MLX](https://img.shields.io/badge/MLX-Latest-orange.svg)](https://github.com/ml-explore/mlx) -[![Tests](https://img.shields.io/badge/tests-86%2F86%20passing-brightgreen.svg)](#testing) +[![Tests](https://img.shields.io/badge/tests-96%2F96%20passing-brightgreen.svg)](#testing) ## Features @@ -43,7 +43,7 @@ A lightweight, ollama-like CLI for managing and running MLX models on Apple Sili - **Memory Insights**: See GPU memory usage after model loading and generation - **Dynamic Stop Tokens**: Automatic detection and filtering of model-specific stop tokens - **Customizable Generation**: Control temperature, max_tokens, top_p, and repetition penalty -- **RAII Memory Management**: Context manager pattern ensures automatic cleanup and no memory leaks +- **Context-Managed Memory**: Context manager pattern ensures automatic cleanup and prevents memory leaks - **Exception-Safe**: Robust error handling with guaranteed resource cleanup ## Installation @@ -298,7 +298,7 @@ mlxk run bert-base-uncased ## Testing -MLX Knife includes comprehensive test coverage with **86/86 tests passing** across all supported Python versions. +MLX Knife includes comprehensive test coverage across all supported Python versions. ### Quick Start @@ -346,7 +346,7 @@ Stop tokens are dynamically extracted from each model's tokenizer: - Common tokens verified as single-token entities ### Memory Management -- **RAII Pattern**: Context manager ensures automatic resource cleanup +- **Context Managers**: Automatic resource cleanup with Python context managers - **Exception-Safe**: Model cleanup guaranteed even on errors - **Baseline Tracking**: Memory captured before model loading - **Real-time Monitoring**: GPU memory tracking via `mlx.core.get_active_memory()` @@ -416,5 +416,5 @@ Copyright (c) 2025 The BROKE team đŸĻĢ

Made with â¤ī¸ by The BROKE team BROKE Logo
- Version 1.0-rc1 | August 2025 + Version 1.0-rc2 | August 2025

\ No newline at end of file diff --git a/SECURITY.md b/SECURITY.md index b65b749..6bc5390 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -98,7 +98,7 @@ mlxk server --host 0.0.0.0 --port 8000 | Version | Supported | | ------- | ------------------ | -| 1.0-rc1 | :white_check_mark: | +| 1.0-rc2 | :white_check_mark: | | < 1.0 | :x: | ## Additional Resources diff --git a/TESTING.md b/TESTING.md index 17a19c2..1ef0da8 100644 --- a/TESTING.md +++ b/TESTING.md @@ -141,12 +141,12 @@ pytest --durations=10 pytest -n auto ``` -## Test Results Summary (1.0-rc1) +## Test Results Summary (1.0-rc2) ### ✅ Current Test Status (August 2025) ``` -Total Tests: 86/86 passing (100% ✅) +Total Tests: 96/96 passing (100% ✅) ├── ✅ Integration Tests: 61 passing ├── ✅ Unit Tests: 25 passing └── ✅ Real MLX Model Tests: All passing with Phi-3-mini @@ -156,7 +156,7 @@ Total Tests: 86/86 passing (100% ✅) - ✅ **Complete test coverage** - All critical functionality validated - ✅ **Real model execution** - No mocked tests - ✅ **Process hygiene confirmed** - No zombie processes, clean shutdowns -- ✅ **Memory management robust** - RAII pattern prevents leaks +- ✅ **Memory management robust** - Context managers prevent leaks - ✅ **Exception safety verified** - Context managers work correctly ### Test Categories Breakdown @@ -173,7 +173,7 @@ Total Tests: 86/86 passing (100% ✅) ## Python Version Compatibility ### Compatibility Status -MLX Knife 1.0-rc1 is fully compatible with Python 3.9-3.13. Comprehensive verification completed with 86/86 tests passing on all supported versions. +MLX Knife 1.0-rc2 is fully compatible with Python 3.9-3.13. Comprehensive verification completed with 96/96 tests passing on all supported versions. ### Manual Multi-Python Testing @@ -195,11 +195,11 @@ deactivate && rm -rf test_39 | Python Version | Status | Tests Passing | |----------------|--------|---------------| -| 3.9.6 (macOS) | ✅ Verified | 86/86 | -| 3.10.x | ✅ Verified | 86/86 | -| 3.11.x | ✅ Verified | 86/86 | -| 3.12.x | ✅ Verified | 86/86 | -| 3.13.x | ✅ Verified | 86/86 | +| 3.9.6 (macOS) | ✅ Verified | 96/96 | +| 3.10.x | ✅ Verified | 96/96 | +| 3.11.x | ✅ Verified | 96/96 | +| 3.12.x | ✅ Verified | 96/96 | +| 3.13.x | ✅ Verified | 96/96 | All versions tested with real MLX model execution (Phi-3-mini-4k-instruct-4bit). @@ -339,20 +339,20 @@ When submitting PRs, please include: Platform: macOS 14.5, M2 Pro Python: 3.11.6 Model: Phi-3-mini-4k-instruct-4bit - Results: 86/86 tests passed + Results: 96/96 tests passed ``` 3. **Any issues encountered** and how you resolved them ## Summary -**MLX Knife 1.0-rc1 Testing Status:** +**MLX Knife 1.0-rc2 Testing Status:** -✅ **Production Ready** - 86/86 tests passing +✅ **Production Ready** - 96/96 tests passing ✅ **Multi-Python Support** - Python 3.9-3.13 verified ✅ **Code Quality** - ruff/mypy integration working ✅ **Real Model Testing** - Phi-3-mini execution confirmed -✅ **Memory Management** - RAII pattern prevents leaks +✅ **Memory Management** - Context managers prevent leaks ✅ **Exception Safety** - Context managers ensure cleanup This comprehensive testing framework validates MLX Knife's **production readiness** through local testing on real Apple Silicon hardware with actual MLX models. \ No newline at end of file diff --git a/mlx_knife/__init__.py b/mlx_knife/__init__.py index 6fb0c15..8339512 100644 --- a/mlx_knife/__init__.py +++ b/mlx_knife/__init__.py @@ -4,7 +4,7 @@ A lightweight, ollama-like CLI for managing and running MLX models on Apple Sili Provides native MLX execution with streaming output and interactive chat capabilities. """ -__version__ = "1.0-rc1" +__version__ = "1.0-rc2" __author__ = "The BROKE team" __email__ = "broke@gmx.eu" __license__ = "MIT" diff --git a/mlx_knife/mlx_runner.py b/mlx_knife/mlx_runner.py index 030d2b4..80d5c71 100644 --- a/mlx_knife/mlx_runner.py +++ b/mlx_knife/mlx_runner.py @@ -34,14 +34,26 @@ class MLXRunner: self._stop_tokens = None # Will be populated from tokenizer self.verbose = verbose self._model_loaded = False + self._context_entered = False # Prevent nested context usage def __enter__(self): """Context manager entry - loads the model.""" - self.load_model() - return self + if self._context_entered: + raise RuntimeError("MLXRunner context manager cannot be entered multiple times") + + self._context_entered = True + try: + self.load_model() + return self + except Exception: + # If load_model fails, ensure cleanup happens + self._context_entered = False + self.cleanup() + raise def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit - cleans up the model.""" + self._context_entered = False self.cleanup() return False # Don't suppress exceptions @@ -57,39 +69,54 @@ class MLXRunner: start_time = time.time() # Capture baseline memory before loading - mx.clear_cache() + try: + mx.clear_cache() + except Exception: + pass # Continue even if cache clear fails self._memory_baseline = mx.get_active_memory() / 1024**3 - # Load model and tokenizer - self.model, self.tokenizer = load( - str(self.model_path), - adapter_path=self.adapter_path - ) + try: + # Load model and tokenizer + self.model, self.tokenizer = load( + str(self.model_path), + adapter_path=self.adapter_path + ) - load_time = time.time() - start_time - current_memory = mx.get_active_memory() / 1024**3 - model_memory = current_memory - self._memory_baseline + load_time = time.time() - start_time + current_memory = mx.get_active_memory() / 1024**3 + model_memory = current_memory - self._memory_baseline - if self.verbose: - print(f"Model loaded in {load_time:.1f}s") - print(f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total") + if self.verbose: + print(f"Model loaded in {load_time:.1f}s") + print(f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total") - # Extract stop tokens from tokenizer - self._extract_stop_tokens() - self._model_loaded = True + # Extract stop tokens from tokenizer + self._extract_stop_tokens() + self._model_loaded = True + + except Exception as e: + # Ensure partial state is cleaned up on failure + self.model = None + self.tokenizer = None + self._stop_tokens = None + self._model_loaded = False + # Clear any memory that might have been allocated + mx.clear_cache() + raise RuntimeError(f"Failed to load model from {self.model_path}: {e}") from e def _extract_stop_tokens(self): """Extract stop tokens from the tokenizer dynamically.""" self._stop_tokens = set() # Primary source: eos_token - if hasattr(self.tokenizer, 'eos_token') and self.tokenizer.eos_token: - self._stop_tokens.add(self.tokenizer.eos_token) + eos_token = getattr(self.tokenizer, 'eos_token', None) + if eos_token: + self._stop_tokens.add(eos_token) # Also check pad_token if it's different from eos_token - if hasattr(self.tokenizer, 'pad_token') and self.tokenizer.pad_token: - if self.tokenizer.pad_token != self.tokenizer.eos_token: - self._stop_tokens.add(self.tokenizer.pad_token) + pad_token = getattr(self.tokenizer, 'pad_token', None) + if pad_token and pad_token != eos_token: + self._stop_tokens.add(pad_token) # Check additional_special_tokens if hasattr(self.tokenizer, 'additional_special_tokens'): @@ -125,17 +152,15 @@ class MLXRunner: print(f"Stop tokens: {self._stop_tokens}") def cleanup(self): - """Clean up model resources and clear GPU memory.""" - if not self._model_loaded: - if self.verbose: - print("No model to cleanup") - return - - if self.verbose: + """Clean up model resources and clear GPU memory. + + This method is safe to call multiple times and handles partial state cleanup. + """ + if self.verbose and self._model_loaded: memory_before = mx.get_active_memory() / 1024**3 print(f"Cleaning up model (memory before: {memory_before:.1f}GB)...") - # Clear model references + # Always clean up, even if model wasn't fully loaded self.model = None self.tokenizer = None self._stop_tokens = None @@ -144,12 +169,18 @@ class MLXRunner: # Force garbage collection and clear MLX cache import gc gc.collect() - mx.clear_cache() + try: + mx.clear_cache() + except Exception: + pass # Continue cleanup even if cache clear fails if self.verbose: memory_after = mx.get_active_memory() / 1024**3 - memory_freed = memory_before - memory_after - print(f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)") + if 'memory_before' in locals(): + memory_freed = memory_before - memory_after + print(f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)") + else: + print(f"Cleanup complete (memory after: {memory_after:.1f}GB)") def generate_streaming( self, @@ -472,8 +503,13 @@ class MLXRunner: Returns: Dictionary with memory statistics in GB """ - current_memory = mx.get_active_memory() / 1024**3 - peak_memory = mx.get_peak_memory() / 1024**3 + try: + current_memory = mx.get_active_memory() / 1024**3 + peak_memory = mx.get_peak_memory() / 1024**3 + except Exception: + # Return zeros if memory stats unavailable + current_memory = 0.0 + peak_memory = 0.0 return { "current_gb": current_memory, diff --git a/tests/unit/test_mlx_runner_memory.py b/tests/unit/test_mlx_runner_memory.py new file mode 100644 index 0000000..aa9f722 --- /dev/null +++ b/tests/unit/test_mlx_runner_memory.py @@ -0,0 +1,295 @@ +""" +Unit tests for MLXRunner memory management robustness. + +Tests context manager implementation, exception handling, +and cleanup guarantees without requiring actual MLX models. +""" +import unittest +from unittest.mock import MagicMock, patch, PropertyMock +import gc + + +class TestMLXRunnerMemoryManagement(unittest.TestCase): + """Test MLXRunner memory management robustness.""" + + @patch('mlx_knife.mlx_runner.mx') + @patch('mlx_knife.mlx_runner.load') + def test_context_manager_basic_flow(self, mock_load, mock_mx): + """Test basic context manager flow with successful execution.""" + from mlx_knife.mlx_runner import MLXRunner + + # Setup mocks + mock_model = MagicMock() + mock_tokenizer = MagicMock() + mock_tokenizer.eos_token = '' + mock_tokenizer.eos_token_id = 2 + mock_load.return_value = (mock_model, mock_tokenizer) + mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024 # 1GB + + # Test successful context manager usage + with MLXRunner("test_model", verbose=False) as runner: + self.assertIsNotNone(runner.model) + self.assertIsNotNone(runner.tokenizer) + self.assertTrue(runner._model_loaded) + self.assertTrue(runner._context_entered) + + # After exiting context, model should be cleaned up + self.assertIsNone(runner.model) + self.assertIsNone(runner.tokenizer) + self.assertFalse(runner._model_loaded) + self.assertFalse(runner._context_entered) + + # Verify cleanup was called + mock_mx.clear_cache.assert_called() + + @patch('mlx_knife.mlx_runner.mx') + @patch('mlx_knife.mlx_runner.load') + def test_context_manager_exception_in_load(self, mock_load, mock_mx): + """Test cleanup when exception occurs during model loading.""" + from mlx_knife.mlx_runner import MLXRunner + + # Setup mock to fail during load + mock_load.side_effect = RuntimeError("Model loading failed") + mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024 + + # Test that exception is propagated and cleanup happens + with self.assertRaises(RuntimeError) as cm: + with MLXRunner("test_model", verbose=False) as runner: + pass # Should never reach here + + self.assertIn("Failed to load model", str(cm.exception)) + + # Verify cleanup was called even on failure + mock_mx.clear_cache.assert_called() + + @patch('mlx_knife.mlx_runner.mx') + @patch('mlx_knife.mlx_runner.load') + def test_context_manager_exception_in_body(self, mock_load, mock_mx): + """Test cleanup when exception occurs in context body.""" + from mlx_knife.mlx_runner import MLXRunner + + # Setup successful mocks + mock_model = MagicMock() + mock_tokenizer = MagicMock() + mock_tokenizer.eos_token = '' + mock_tokenizer.eos_token_id = 2 + mock_load.return_value = (mock_model, mock_tokenizer) + mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024 + + # Test exception in context body + with self.assertRaises(ValueError): + with MLXRunner("test_model", verbose=False) as runner: + self.assertTrue(runner._model_loaded) + raise ValueError("User error") + + # Cleanup should still happen + self.assertIsNone(runner.model) + self.assertIsNone(runner.tokenizer) + self.assertFalse(runner._model_loaded) + mock_mx.clear_cache.assert_called() + + @patch('mlx_knife.mlx_runner.mx') + @patch('mlx_knife.mlx_runner.load') + def test_prevent_nested_context_usage(self, mock_load, mock_mx): + """Test that nested context manager usage is prevented.""" + from mlx_knife.mlx_runner import MLXRunner + + # Setup mocks + mock_model = MagicMock() + mock_tokenizer = MagicMock() + mock_tokenizer.eos_token = '' + mock_tokenizer.eos_token_id = 2 + mock_load.return_value = (mock_model, mock_tokenizer) + mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024 + + runner = MLXRunner("test_model", verbose=False) + + # First context should work + with runner: + self.assertTrue(runner._context_entered) + + # Nested context should fail + with self.assertRaises(RuntimeError) as cm: + with runner: + pass + + self.assertIn("cannot be entered multiple times", str(cm.exception)) + + # After exiting, should be able to use again + self.assertFalse(runner._context_entered) + + # Second usage should work + with runner: + self.assertTrue(runner._context_entered) + + @patch('mlx_knife.mlx_runner.mx') + @patch('mlx_knife.mlx_runner.load') + def test_partial_loading_failure_cleanup(self, mock_load, mock_mx): + """Test cleanup when loading partially succeeds then fails.""" + from mlx_knife.mlx_runner import MLXRunner + + # Setup mock to partially succeed + mock_model = MagicMock() + mock_tokenizer = MagicMock() + + # Missing required attributes to trigger failure in _extract_stop_tokens + del mock_tokenizer.eos_token + del mock_tokenizer.eos_token_id + mock_tokenizer.encode.side_effect = Exception("Tokenizer error") + + mock_load.return_value = (mock_model, mock_tokenizer) + mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024 + + runner = MLXRunner("test_model", verbose=False) + + # Load should succeed even with tokenizer issues + try: + runner.load_model() + # Model should be loaded even if stop token extraction had issues + self.assertIsNotNone(runner.model) + self.assertIsNotNone(runner.tokenizer) + finally: + # Cleanup should work regardless + runner.cleanup() + self.assertIsNone(runner.model) + self.assertIsNone(runner.tokenizer) + mock_mx.clear_cache.assert_called() + + @patch('mlx_knife.mlx_runner.mx') + def test_cleanup_idempotency(self, mock_mx): + """Test that cleanup can be called multiple times safely.""" + from mlx_knife.mlx_runner import MLXRunner + + mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024 + + runner = MLXRunner("test_model", verbose=False) + runner.model = MagicMock() + runner.tokenizer = MagicMock() + runner._model_loaded = True + + # Call cleanup multiple times + for _ in range(3): + runner.cleanup() + self.assertIsNone(runner.model) + self.assertIsNone(runner.tokenizer) + self.assertFalse(runner._model_loaded) + + # Should have been called at least once + mock_mx.clear_cache.assert_called() + + @patch('mlx_knife.mlx_runner.mx') + @patch('mlx_knife.mlx_runner.load') + def test_memory_baseline_tracking(self, mock_load, mock_mx): + """Test memory baseline is properly tracked.""" + from mlx_knife.mlx_runner import MLXRunner + + # Setup mocks + mock_model = MagicMock() + mock_tokenizer = MagicMock() + mock_tokenizer.eos_token = '' + mock_tokenizer.eos_token_id = 2 + mock_load.return_value = (mock_model, mock_tokenizer) + + # Simulate memory growth during loading + memory_values = [ + 1 * 1024**3, # 1GB baseline + 5 * 1024**3, # 5GB after loading + 5 * 1024**3, # 5GB when querying stats + ] + mock_mx.get_active_memory.side_effect = memory_values + + runner = MLXRunner("test_model", verbose=False) + runner.load_model() + + # Check baseline was captured + self.assertEqual(runner._memory_baseline, 1.0) # 1GB + + # Check memory usage calculation + memory_stats = runner.get_memory_usage() + self.assertEqual(memory_stats["model_gb"], 4.0) # 5GB - 1GB = 4GB + + @patch('mlx_knife.mlx_runner.mx') + @patch('mlx_knife.mlx_runner.load') + def test_generate_without_loading(self, mock_load, mock_mx): + """Test that generate methods fail gracefully without loaded model.""" + from mlx_knife.mlx_runner import MLXRunner + + runner = MLXRunner("test_model", verbose=False) + + # Try to generate without loading + with self.assertRaises(RuntimeError) as cm: + list(runner.generate_streaming("test prompt")) + self.assertIn("Model not loaded", str(cm.exception)) + + with self.assertRaises(RuntimeError) as cm: + runner.generate_batch("test prompt") + self.assertIn("Model not loaded", str(cm.exception)) + + @patch('mlx_knife.mlx_runner.mx') + @patch('mlx_knife.mlx_runner.load') + def test_server_usage_without_context_manager(self, mock_load, mock_mx): + """Test server-style usage without context manager.""" + from mlx_knife.mlx_runner import MLXRunner + + # Setup mocks + mock_model = MagicMock() + mock_tokenizer = MagicMock() + mock_tokenizer.eos_token = '' + mock_tokenizer.eos_token_id = 2 + mock_load.return_value = (mock_model, mock_tokenizer) + mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024 + + # Server style: manual load and cleanup + runner = MLXRunner("test_model", verbose=False) + + try: + runner.load_model() + self.assertTrue(runner._model_loaded) + self.assertIsNotNone(runner.model) + + # Simulate server keeping model loaded + # and potentially switching models + runner.cleanup() + self.assertFalse(runner._model_loaded) + self.assertIsNone(runner.model) + + # Load again (simulating model switch) + runner.load_model() + self.assertTrue(runner._model_loaded) + + finally: + # Ensure cleanup happens + runner.cleanup() + self.assertFalse(runner._model_loaded) + + @patch('mlx_knife.mlx_runner.mx') + @patch('mlx_knife.mlx_runner.load') + def test_exception_during_cleanup(self, mock_load, mock_mx): + """Test that cleanup handles exceptions gracefully.""" + from mlx_knife.mlx_runner import MLXRunner + + # Setup mocks + mock_model = MagicMock() + mock_tokenizer = MagicMock() + mock_tokenizer.eos_token = '' + mock_tokenizer.eos_token_id = 2 + mock_load.return_value = (mock_model, mock_tokenizer) + mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024 + + # Make clear_cache raise an exception + mock_mx.clear_cache.side_effect = Exception("Cache clear failed") + + runner = MLXRunner("test_model", verbose=False) + runner.load_model() + + # Cleanup should complete even if mx.clear_cache fails + runner.cleanup() # Should not raise + + # State should still be cleaned + self.assertIsNone(runner.model) + self.assertIsNone(runner.tokenizer) + self.assertFalse(runner._model_loaded) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file