From 01229cb6ef902e67bb53d270b31ec3c88823b391 Mon Sep 17 00:00:00 2001
From: mzfive <mail@zedfive.eu>
Date: Wed, 13 Aug 2025 20:52:34 +0200
Subject: [PATCH]  Release MLX Knife 1.0-rc2: Enhanced Memory Management &
 Exception Safety

 **Key Improvements**
  - Robust exception handling during model loading with guaranteed cleanup
  - Protection against nested context manager usage in MLXRunner
  - Safe cleanup that handles partial loading failures gracefully
  - Exception-resilient cache clearing operations
  - Safe tokenizer attribute access with proper defaults
  - Graceful memory statistics handling when metrics unavailable
  - Comprehensive unit test coverage for memory management edge cases

 **Changes**
  - Updated version to 1.0-rc2 across all documentation files
  - Enhanced MLXRunner context manager with bulletproof exception safety
  - Added comprehensive unit tests for memory management scenarios
  - Improved error handling for partial model loading failures
  - Updated test coverage documentation (96/96 tests passing)
  - Refined README to focus on key features rather than test metrics

  This release focuses on production-ready memory management and exception
  safety, making MLX Knife more robust for real-world usage scenarios.
---
 CHANGELOG.md                         |  18 +-
 README.md                            |  12 +-
 SECURITY.md                          |   2 +-
 TESTING.md                           |  26 +--
 mlx_knife/__init__.py                |   2 +-
 mlx_knife/mlx_runner.py              | 106 ++++++----
 tests/unit/test_mlx_runner_memory.py | 295 +++++++++++++++++++++++++++
 7 files changed, 404 insertions(+), 57 deletions(-)
 create mode 100644 tests/unit/test_mlx_runner_memory.py

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 211c8bf..d47dd4d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,21 @@
 # Changelog
 
+## [1.0-rc2] - 2025-08-13
+
+### Enhanced
+- Robust exception handling during model loading with guaranteed cleanup
+- Protection against nested context manager usage 
+- Safe cleanup that handles partial loading failures
+- Exception-resilient cache clearing (won't fail if cache operations error)
+- Safe tokenizer attribute access using getattr() with defaults
+- Graceful memory stats handling when metrics unavailable
+- Comprehensive unit test coverage for all memory management edge cases
+
+### Fixed
+- Memory management edge cases in MLXRunner context manager
+- Exception safety during model loading and cleanup operations
+- Improved error handling for partial model loading failures
+
 ## [1.0-rc1] - 2025-08-12
 
 ### Added
@@ -10,5 +26,5 @@
 - Multi-Python support (3.9-3.13)
 - Comprehensive test suite (86/86 passing)
 
-### Known Issues
+## Known Issues
 - See GitHub Issues for tracking
\ No newline at end of file
diff --git a/README.md b/README.md
index 6c2e92b..05f8f92 100644
--- a/README.md
+++ b/README.md
@@ -6,7 +6,7 @@
 
 A lightweight, ollama-like CLI for managing and running MLX models on Apple Silicon. **Designed for personal, local use** - perfect for individual developers and researchers working with MLX models.
 
-**Current Version**: 1.0-rc1 (August 2025)
+**Current Version**: 1.0-rc2 (August 2025)
 
 [![GitHub Release](https://img.shields.io/github/v/release/mzau/mlx-knife)](https://github.com/mzau/mlx-knife/releases)
 [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
@@ -14,7 +14,7 @@ A lightweight, ollama-like CLI for managing and running MLX models on Apple Sili
 [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/)
 [![Apple Silicon](https://img.shields.io/badge/Apple%20Silicon-M1%2FM2%2FM3-green.svg)](https://support.apple.com/en-us/HT211814)
 [![MLX](https://img.shields.io/badge/MLX-Latest-orange.svg)](https://github.com/ml-explore/mlx)
-[![Tests](https://img.shields.io/badge/tests-86%2F86%20passing-brightgreen.svg)](#testing)
+[![Tests](https://img.shields.io/badge/tests-96%2F96%20passing-brightgreen.svg)](#testing)
 
 ## Features
 
@@ -43,7 +43,7 @@ A lightweight, ollama-like CLI for managing and running MLX models on Apple Sili
 - **Memory Insights**: See GPU memory usage after model loading and generation
 - **Dynamic Stop Tokens**: Automatic detection and filtering of model-specific stop tokens
 - **Customizable Generation**: Control temperature, max_tokens, top_p, and repetition penalty
-- **RAII Memory Management**: Context manager pattern ensures automatic cleanup and no memory leaks
+- **Context-Managed Memory**: Context manager pattern ensures automatic cleanup and prevents memory leaks
 - **Exception-Safe**: Robust error handling with guaranteed resource cleanup
 
 ## Installation
@@ -298,7 +298,7 @@ mlxk run bert-base-uncased
 
 ## Testing
 
-MLX Knife includes comprehensive test coverage with **86/86 tests passing** across all supported Python versions.
+MLX Knife includes comprehensive test coverage across all supported Python versions.
 
 ### Quick Start
 
@@ -346,7 +346,7 @@ Stop tokens are dynamically extracted from each model's tokenizer:
 - Common tokens verified as single-token entities
 
 ### Memory Management
-- **RAII Pattern**: Context manager ensures automatic resource cleanup
+- **Context Managers**: Automatic resource cleanup with Python context managers
 - **Exception-Safe**: Model cleanup guaranteed even on errors
 - **Baseline Tracking**: Memory captured before model loading
 - **Real-time Monitoring**: GPU memory tracking via `mlx.core.get_active_memory()`
@@ -416,5 +416,5 @@ Copyright (c) 2025 The BROKE team 🦫
 
 <p align="center">
   <b>Made with ❤️ by The BROKE team <img src="broke-logo.png" alt="BROKE Logo" width="30" style="vertical-align: middle;"></b><br>
-  <i>Version 1.0-rc1 | August 2025</i>
+  <i>Version 1.0-rc2 | August 2025</i>
 </p>
\ No newline at end of file
diff --git a/SECURITY.md b/SECURITY.md
index b65b749..6bc5390 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -98,7 +98,7 @@ mlxk server --host 0.0.0.0 --port 8000
 
 | Version | Supported          |
 | ------- | ------------------ |
-| 1.0-rc1 | :white_check_mark: |
+| 1.0-rc2 | :white_check_mark: |
 | < 1.0   | :x:                |
 
 ## Additional Resources
diff --git a/TESTING.md b/TESTING.md
index 17a19c2..1ef0da8 100644
--- a/TESTING.md
+++ b/TESTING.md
@@ -141,12 +141,12 @@ pytest --durations=10
 pytest -n auto
 ```
 
-## Test Results Summary (1.0-rc1)
+## Test Results Summary (1.0-rc2)
 
 ### ✅ Current Test Status (August 2025)
 
 ```
-Total Tests: 86/86 passing (100% ✅)
+Total Tests: 96/96 passing (100% ✅)
 ├── ✅ Integration Tests: 61 passing
 ├── ✅ Unit Tests: 25 passing  
 └── ✅ Real MLX Model Tests: All passing with Phi-3-mini
@@ -156,7 +156,7 @@ Total Tests: 86/86 passing (100% ✅)
 - ✅ **Complete test coverage** - All critical functionality validated
 - ✅ **Real model execution** - No mocked tests
 - ✅ **Process hygiene confirmed** - No zombie processes, clean shutdowns
-- ✅ **Memory management robust** - RAII pattern prevents leaks
+- ✅ **Memory management robust** - Context managers prevent leaks
 - ✅ **Exception safety verified** - Context managers work correctly
 
 ### Test Categories Breakdown
@@ -173,7 +173,7 @@ Total Tests: 86/86 passing (100% ✅)
 ## Python Version Compatibility
 
 ### Compatibility Status
-MLX Knife 1.0-rc1 is fully compatible with Python 3.9-3.13. Comprehensive verification completed with 86/86 tests passing on all supported versions.
+MLX Knife 1.0-rc2 is fully compatible with Python 3.9-3.13. Comprehensive verification completed with 96/96 tests passing on all supported versions.
 
 ### Manual Multi-Python Testing
 
@@ -195,11 +195,11 @@ deactivate && rm -rf test_39
 
 | Python Version | Status | Tests Passing |
 |----------------|--------|---------------|
-| 3.9.6 (macOS)  | ✅ Verified | 86/86 |
-| 3.10.x         | ✅ Verified | 86/86 |
-| 3.11.x         | ✅ Verified | 86/86 |
-| 3.12.x         | ✅ Verified | 86/86 |
-| 3.13.x         | ✅ Verified | 86/86 |
+| 3.9.6 (macOS)  | ✅ Verified | 96/96 |
+| 3.10.x         | ✅ Verified | 96/96 |
+| 3.11.x         | ✅ Verified | 96/96 |
+| 3.12.x         | ✅ Verified | 96/96 |
+| 3.13.x         | ✅ Verified | 96/96 |
 
 All versions tested with real MLX model execution (Phi-3-mini-4k-instruct-4bit).
 
@@ -339,20 +339,20 @@ When submitting PRs, please include:
    Platform: macOS 14.5, M2 Pro
    Python: 3.11.6
    Model: Phi-3-mini-4k-instruct-4bit
-   Results: 86/86 tests passed
+   Results: 96/96 tests passed
    ```
 
 3. **Any issues encountered** and how you resolved them
 
 ## Summary
 
-**MLX Knife 1.0-rc1 Testing Status:**
+**MLX Knife 1.0-rc2 Testing Status:**
 
-✅ **Production Ready** - 86/86 tests passing  
+✅ **Production Ready** - 96/96 tests passing  
 ✅ **Multi-Python Support** - Python 3.9-3.13 verified  
 ✅ **Code Quality** - ruff/mypy integration working  
 ✅ **Real Model Testing** - Phi-3-mini execution confirmed  
-✅ **Memory Management** - RAII pattern prevents leaks  
+✅ **Memory Management** - Context managers prevent leaks  
 ✅ **Exception Safety** - Context managers ensure cleanup  
 
 This comprehensive testing framework validates MLX Knife's **production readiness** through local testing on real Apple Silicon hardware with actual MLX models.
\ No newline at end of file
diff --git a/mlx_knife/__init__.py b/mlx_knife/__init__.py
index 6fb0c15..8339512 100644
--- a/mlx_knife/__init__.py
+++ b/mlx_knife/__init__.py
@@ -4,7 +4,7 @@ A lightweight, ollama-like CLI for managing and running MLX models on Apple Sili
 Provides native MLX execution with streaming output and interactive chat capabilities.
 """
 
-__version__ = "1.0-rc1"
+__version__ = "1.0-rc2"
 __author__ = "The BROKE team"
 __email__ = "broke@gmx.eu"
 __license__ = "MIT"
diff --git a/mlx_knife/mlx_runner.py b/mlx_knife/mlx_runner.py
index 030d2b4..80d5c71 100644
--- a/mlx_knife/mlx_runner.py
+++ b/mlx_knife/mlx_runner.py
@@ -34,14 +34,26 @@ class MLXRunner:
         self._stop_tokens = None  # Will be populated from tokenizer
         self.verbose = verbose
         self._model_loaded = False
+        self._context_entered = False  # Prevent nested context usage
 
     def __enter__(self):
         """Context manager entry - loads the model."""
-        self.load_model()
-        return self
+        if self._context_entered:
+            raise RuntimeError("MLXRunner context manager cannot be entered multiple times")
+        
+        self._context_entered = True
+        try:
+            self.load_model()
+            return self
+        except Exception:
+            # If load_model fails, ensure cleanup happens
+            self._context_entered = False
+            self.cleanup()
+            raise
 
     def __exit__(self, exc_type, exc_val, exc_tb):
         """Context manager exit - cleans up the model."""
+        self._context_entered = False
         self.cleanup()
         return False  # Don't suppress exceptions
 
@@ -57,39 +69,54 @@ class MLXRunner:
         start_time = time.time()
 
         # Capture baseline memory before loading
-        mx.clear_cache()
+        try:
+            mx.clear_cache()
+        except Exception:
+            pass  # Continue even if cache clear fails
         self._memory_baseline = mx.get_active_memory() / 1024**3
 
-        # Load model and tokenizer
-        self.model, self.tokenizer = load(
-            str(self.model_path),
-            adapter_path=self.adapter_path
-        )
+        try:
+            # Load model and tokenizer
+            self.model, self.tokenizer = load(
+                str(self.model_path),
+                adapter_path=self.adapter_path
+            )
 
-        load_time = time.time() - start_time
-        current_memory = mx.get_active_memory() / 1024**3
-        model_memory = current_memory - self._memory_baseline
+            load_time = time.time() - start_time
+            current_memory = mx.get_active_memory() / 1024**3
+            model_memory = current_memory - self._memory_baseline
 
-        if self.verbose:
-            print(f"Model loaded in {load_time:.1f}s")
-            print(f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total")
+            if self.verbose:
+                print(f"Model loaded in {load_time:.1f}s")
+                print(f"Memory: {model_memory:.1f}GB model, {current_memory:.1f}GB total")
 
-        # Extract stop tokens from tokenizer
-        self._extract_stop_tokens()
-        self._model_loaded = True
+            # Extract stop tokens from tokenizer
+            self._extract_stop_tokens()
+            self._model_loaded = True
+            
+        except Exception as e:
+            # Ensure partial state is cleaned up on failure
+            self.model = None
+            self.tokenizer = None
+            self._stop_tokens = None
+            self._model_loaded = False
+            # Clear any memory that might have been allocated
+            mx.clear_cache()
+            raise RuntimeError(f"Failed to load model from {self.model_path}: {e}") from e
 
     def _extract_stop_tokens(self):
         """Extract stop tokens from the tokenizer dynamically."""
         self._stop_tokens = set()
 
         # Primary source: eos_token
-        if hasattr(self.tokenizer, 'eos_token') and self.tokenizer.eos_token:
-            self._stop_tokens.add(self.tokenizer.eos_token)
+        eos_token = getattr(self.tokenizer, 'eos_token', None)
+        if eos_token:
+            self._stop_tokens.add(eos_token)
 
         # Also check pad_token if it's different from eos_token
-        if hasattr(self.tokenizer, 'pad_token') and self.tokenizer.pad_token:
-            if self.tokenizer.pad_token != self.tokenizer.eos_token:
-                self._stop_tokens.add(self.tokenizer.pad_token)
+        pad_token = getattr(self.tokenizer, 'pad_token', None)
+        if pad_token and pad_token != eos_token:
+            self._stop_tokens.add(pad_token)
 
         # Check additional_special_tokens
         if hasattr(self.tokenizer, 'additional_special_tokens'):
@@ -125,17 +152,15 @@ class MLXRunner:
             print(f"Stop tokens: {self._stop_tokens}")
 
     def cleanup(self):
-        """Clean up model resources and clear GPU memory."""
-        if not self._model_loaded:
-            if self.verbose:
-                print("No model to cleanup")
-            return
-
-        if self.verbose:
+        """Clean up model resources and clear GPU memory.
+        
+        This method is safe to call multiple times and handles partial state cleanup.
+        """
+        if self.verbose and self._model_loaded:
             memory_before = mx.get_active_memory() / 1024**3
             print(f"Cleaning up model (memory before: {memory_before:.1f}GB)...")
 
-        # Clear model references
+        # Always clean up, even if model wasn't fully loaded
         self.model = None
         self.tokenizer = None
         self._stop_tokens = None
@@ -144,12 +169,18 @@ class MLXRunner:
         # Force garbage collection and clear MLX cache
         import gc
         gc.collect()
-        mx.clear_cache()
+        try:
+            mx.clear_cache()
+        except Exception:
+            pass  # Continue cleanup even if cache clear fails
 
         if self.verbose:
             memory_after = mx.get_active_memory() / 1024**3
-            memory_freed = memory_before - memory_after
-            print(f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)")
+            if 'memory_before' in locals():
+                memory_freed = memory_before - memory_after
+                print(f"Cleanup complete (memory after: {memory_after:.1f}GB, freed: {memory_freed:.1f}GB)")
+            else:
+                print(f"Cleanup complete (memory after: {memory_after:.1f}GB)")
 
     def generate_streaming(
         self,
@@ -472,8 +503,13 @@ class MLXRunner:
         Returns:
             Dictionary with memory statistics in GB
         """
-        current_memory = mx.get_active_memory() / 1024**3
-        peak_memory = mx.get_peak_memory() / 1024**3
+        try:
+            current_memory = mx.get_active_memory() / 1024**3
+            peak_memory = mx.get_peak_memory() / 1024**3
+        except Exception:
+            # Return zeros if memory stats unavailable
+            current_memory = 0.0
+            peak_memory = 0.0
 
         return {
             "current_gb": current_memory,
diff --git a/tests/unit/test_mlx_runner_memory.py b/tests/unit/test_mlx_runner_memory.py
new file mode 100644
index 0000000..aa9f722
--- /dev/null
+++ b/tests/unit/test_mlx_runner_memory.py
@@ -0,0 +1,295 @@
+"""
+Unit tests for MLXRunner memory management robustness.
+
+Tests context manager implementation, exception handling, 
+and cleanup guarantees without requiring actual MLX models.
+"""
+import unittest
+from unittest.mock import MagicMock, patch, PropertyMock
+import gc
+
+
+class TestMLXRunnerMemoryManagement(unittest.TestCase):
+    """Test MLXRunner memory management robustness."""
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    @patch('mlx_knife.mlx_runner.load')
+    def test_context_manager_basic_flow(self, mock_load, mock_mx):
+        """Test basic context manager flow with successful execution."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        # Setup mocks
+        mock_model = MagicMock()
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.eos_token = '</s>'
+        mock_tokenizer.eos_token_id = 2
+        mock_load.return_value = (mock_model, mock_tokenizer)
+        mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024  # 1GB
+        
+        # Test successful context manager usage
+        with MLXRunner("test_model", verbose=False) as runner:
+            self.assertIsNotNone(runner.model)
+            self.assertIsNotNone(runner.tokenizer)
+            self.assertTrue(runner._model_loaded)
+            self.assertTrue(runner._context_entered)
+        
+        # After exiting context, model should be cleaned up
+        self.assertIsNone(runner.model)
+        self.assertIsNone(runner.tokenizer)
+        self.assertFalse(runner._model_loaded)
+        self.assertFalse(runner._context_entered)
+        
+        # Verify cleanup was called
+        mock_mx.clear_cache.assert_called()
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    @patch('mlx_knife.mlx_runner.load')
+    def test_context_manager_exception_in_load(self, mock_load, mock_mx):
+        """Test cleanup when exception occurs during model loading."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        # Setup mock to fail during load
+        mock_load.side_effect = RuntimeError("Model loading failed")
+        mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024
+        
+        # Test that exception is propagated and cleanup happens
+        with self.assertRaises(RuntimeError) as cm:
+            with MLXRunner("test_model", verbose=False) as runner:
+                pass  # Should never reach here
+        
+        self.assertIn("Failed to load model", str(cm.exception))
+        
+        # Verify cleanup was called even on failure
+        mock_mx.clear_cache.assert_called()
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    @patch('mlx_knife.mlx_runner.load')
+    def test_context_manager_exception_in_body(self, mock_load, mock_mx):
+        """Test cleanup when exception occurs in context body."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        # Setup successful mocks
+        mock_model = MagicMock()
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.eos_token = '</s>'
+        mock_tokenizer.eos_token_id = 2
+        mock_load.return_value = (mock_model, mock_tokenizer)
+        mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024
+        
+        # Test exception in context body
+        with self.assertRaises(ValueError):
+            with MLXRunner("test_model", verbose=False) as runner:
+                self.assertTrue(runner._model_loaded)
+                raise ValueError("User error")
+        
+        # Cleanup should still happen
+        self.assertIsNone(runner.model)
+        self.assertIsNone(runner.tokenizer)
+        self.assertFalse(runner._model_loaded)
+        mock_mx.clear_cache.assert_called()
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    @patch('mlx_knife.mlx_runner.load')
+    def test_prevent_nested_context_usage(self, mock_load, mock_mx):
+        """Test that nested context manager usage is prevented."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        # Setup mocks
+        mock_model = MagicMock()
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.eos_token = '</s>'
+        mock_tokenizer.eos_token_id = 2
+        mock_load.return_value = (mock_model, mock_tokenizer)
+        mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024
+        
+        runner = MLXRunner("test_model", verbose=False)
+        
+        # First context should work
+        with runner:
+            self.assertTrue(runner._context_entered)
+            
+            # Nested context should fail
+            with self.assertRaises(RuntimeError) as cm:
+                with runner:
+                    pass
+            
+            self.assertIn("cannot be entered multiple times", str(cm.exception))
+        
+        # After exiting, should be able to use again
+        self.assertFalse(runner._context_entered)
+        
+        # Second usage should work
+        with runner:
+            self.assertTrue(runner._context_entered)
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    @patch('mlx_knife.mlx_runner.load')
+    def test_partial_loading_failure_cleanup(self, mock_load, mock_mx):
+        """Test cleanup when loading partially succeeds then fails."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        # Setup mock to partially succeed
+        mock_model = MagicMock()
+        mock_tokenizer = MagicMock()
+        
+        # Missing required attributes to trigger failure in _extract_stop_tokens
+        del mock_tokenizer.eos_token
+        del mock_tokenizer.eos_token_id
+        mock_tokenizer.encode.side_effect = Exception("Tokenizer error")
+        
+        mock_load.return_value = (mock_model, mock_tokenizer)
+        mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024
+        
+        runner = MLXRunner("test_model", verbose=False)
+        
+        # Load should succeed even with tokenizer issues
+        try:
+            runner.load_model()
+            # Model should be loaded even if stop token extraction had issues
+            self.assertIsNotNone(runner.model)
+            self.assertIsNotNone(runner.tokenizer)
+        finally:
+            # Cleanup should work regardless
+            runner.cleanup()
+            self.assertIsNone(runner.model)
+            self.assertIsNone(runner.tokenizer)
+            mock_mx.clear_cache.assert_called()
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    def test_cleanup_idempotency(self, mock_mx):
+        """Test that cleanup can be called multiple times safely."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024
+        
+        runner = MLXRunner("test_model", verbose=False)
+        runner.model = MagicMock()
+        runner.tokenizer = MagicMock()
+        runner._model_loaded = True
+        
+        # Call cleanup multiple times
+        for _ in range(3):
+            runner.cleanup()
+            self.assertIsNone(runner.model)
+            self.assertIsNone(runner.tokenizer)
+            self.assertFalse(runner._model_loaded)
+        
+        # Should have been called at least once
+        mock_mx.clear_cache.assert_called()
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    @patch('mlx_knife.mlx_runner.load')
+    def test_memory_baseline_tracking(self, mock_load, mock_mx):
+        """Test memory baseline is properly tracked."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        # Setup mocks
+        mock_model = MagicMock()
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.eos_token = '</s>'
+        mock_tokenizer.eos_token_id = 2
+        mock_load.return_value = (mock_model, mock_tokenizer)
+        
+        # Simulate memory growth during loading
+        memory_values = [
+            1 * 1024**3,  # 1GB baseline
+            5 * 1024**3,  # 5GB after loading
+            5 * 1024**3,  # 5GB when querying stats
+        ]
+        mock_mx.get_active_memory.side_effect = memory_values
+        
+        runner = MLXRunner("test_model", verbose=False)
+        runner.load_model()
+        
+        # Check baseline was captured
+        self.assertEqual(runner._memory_baseline, 1.0)  # 1GB
+        
+        # Check memory usage calculation
+        memory_stats = runner.get_memory_usage()
+        self.assertEqual(memory_stats["model_gb"], 4.0)  # 5GB - 1GB = 4GB
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    @patch('mlx_knife.mlx_runner.load')
+    def test_generate_without_loading(self, mock_load, mock_mx):
+        """Test that generate methods fail gracefully without loaded model."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        runner = MLXRunner("test_model", verbose=False)
+        
+        # Try to generate without loading
+        with self.assertRaises(RuntimeError) as cm:
+            list(runner.generate_streaming("test prompt"))
+        self.assertIn("Model not loaded", str(cm.exception))
+        
+        with self.assertRaises(RuntimeError) as cm:
+            runner.generate_batch("test prompt")
+        self.assertIn("Model not loaded", str(cm.exception))
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    @patch('mlx_knife.mlx_runner.load')
+    def test_server_usage_without_context_manager(self, mock_load, mock_mx):
+        """Test server-style usage without context manager."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        # Setup mocks
+        mock_model = MagicMock()
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.eos_token = '</s>'
+        mock_tokenizer.eos_token_id = 2
+        mock_load.return_value = (mock_model, mock_tokenizer)
+        mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024
+        
+        # Server style: manual load and cleanup
+        runner = MLXRunner("test_model", verbose=False)
+        
+        try:
+            runner.load_model()
+            self.assertTrue(runner._model_loaded)
+            self.assertIsNotNone(runner.model)
+            
+            # Simulate server keeping model loaded
+            # and potentially switching models
+            runner.cleanup()
+            self.assertFalse(runner._model_loaded)
+            self.assertIsNone(runner.model)
+            
+            # Load again (simulating model switch)
+            runner.load_model()
+            self.assertTrue(runner._model_loaded)
+            
+        finally:
+            # Ensure cleanup happens
+            runner.cleanup()
+            self.assertFalse(runner._model_loaded)
+    
+    @patch('mlx_knife.mlx_runner.mx')
+    @patch('mlx_knife.mlx_runner.load')
+    def test_exception_during_cleanup(self, mock_load, mock_mx):
+        """Test that cleanup handles exceptions gracefully."""
+        from mlx_knife.mlx_runner import MLXRunner
+        
+        # Setup mocks
+        mock_model = MagicMock()
+        mock_tokenizer = MagicMock()
+        mock_tokenizer.eos_token = '</s>'
+        mock_tokenizer.eos_token_id = 2
+        mock_load.return_value = (mock_model, mock_tokenizer)
+        mock_mx.get_active_memory.return_value = 1024 * 1024 * 1024
+        
+        # Make clear_cache raise an exception
+        mock_mx.clear_cache.side_effect = Exception("Cache clear failed")
+        
+        runner = MLXRunner("test_model", verbose=False)
+        runner.load_model()
+        
+        # Cleanup should complete even if mx.clear_cache fails
+        runner.cleanup()  # Should not raise
+        
+        # State should still be cleaned
+        self.assertIsNone(runner.model)
+        self.assertIsNone(runner.tokenizer)
+        self.assertFalse(runner._model_loaded)
+
+
+if __name__ == '__main__':
+    unittest.main()
\ No newline at end of file