Testing Semantic Kernel Applications: Plugins, Memory, and Planners
Semantic Kernel (SK) applications combine LLM calls, plugin functions, memory retrieval, and multi-step planning into a single execution graph. Testing each layer in isolation — without making real API calls — is the difference between a test suite that runs in 2 seconds and one that costs $50 per CI run.
Project Structure for Testability
Separate concerns from the start. If your business logic is tangled with kernel construction, it's hard to mock.
src/
  plugins/
    email_plugin.py
    calendar_plugin.py
  agents/
    scheduling_agent.py
  kernel_factory.py  # builds and configures the kernel
tests/
  unit/
    test_email_plugin.py
    test_scheduling_agent.py
  integration/
    test_planner_e2e.py

Testing Plugins in Isolation
Plugins are regular Python classes whose methods are decorated with @kernel_function. Since those methods are ordinary (async) functions, you can test them without a kernel:
# src/plugins/email_plugin.py
from semantic_kernel.functions import kernel_function
from semantic_kernel.functions.kernel_function_decorator import kernel_function
class EmailPlugin:
    """Kernel plugin exposing email actions backed by an injected client."""

    def __init__(self, email_client):
        # The client is injected so callers (and tests) decide which one is used.
        self._client = email_client

    @kernel_function(name="send_email", description="Send an email to a recipient")
    async def send_email(self, recipient: str, subject: str, body: str) -> str:
        """Send a message through the client and return a confirmation string."""
        sent = await self._client.send(to=recipient, subject=subject, body=body)
        return f"Email sent to {recipient} (id: {sent.message_id})"

    @kernel_function(name="get_inbox", description="Get recent inbox messages")
    async def get_inbox(self, limit: int = 10) -> str:
        """Fetch inbox messages with the given ``limit`` and render one line each."""
        inbox = await self._client.fetch_inbox(limit=limit)
        lines = [f"- {msg.subject} from {msg.sender}" for msg in inbox]
        return "\n".join(lines)


# tests/unit/test_email_plugin.py
import pytest
from unittest.mock import AsyncMock, MagicMock
from src.plugins.email_plugin import EmailPlugin
@pytest.fixture
def mock_email_client():
    """Provide an async email-client double with canned send and inbox results."""
    canned_inbox = [
        MagicMock(subject="Meeting tomorrow", sender="boss@company.com"),
        MagicMock(subject="Invoice #442", sender="billing@vendor.com"),
    ]
    send_receipt = MagicMock(message_id="msg-123")

    client = AsyncMock()
    client.send.return_value = send_receipt
    client.fetch_inbox.return_value = canned_inbox
    return client
@pytest.fixture
def email_plugin(mock_email_client):
    """Build the plugin under test around the mocked email client."""
    plugin = EmailPlugin(email_client=mock_email_client)
    return plugin
@pytest.mark.asyncio
async def test_send_email_returns_confirmation(email_plugin, mock_email_client):
    """send_email reports recipient and message id and forwards its arguments."""
    message = {
        "recipient": "alice@example.com",
        "subject": "Hello",
        "body": "Test message",
    }

    confirmation = await email_plugin.send_email(**message)

    # The confirmation must mention both the recipient and the client's message id.
    for fragment in ("alice@example.com", "msg-123"):
        assert fragment in confirmation
    mock_email_client.send.assert_called_once_with(
        to=message["recipient"],
        subject=message["subject"],
        body=message["body"],
    )
@pytest.mark.asyncio
async def test_get_inbox_formats_messages(email_plugin):
result = await email_plugin.get_inbox(limit=5)
assert "Meeting tomorrow" in result
assert "boss@company.com" in result
assert "Invoice #442" in resultMocking the Kernel and Chat Completion Service
When testing code that calls kernel.invoke() or uses the chat service directly, mock the AI layer:
# src/agents/scheduling_agent.py
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import OpenAIChatCompletion
from semantic_kernel.contents import ChatHistory
class SchedulingAgent:
    """Asks the kernel's chat completion service for meeting-time suggestions."""

    def __init__(self, kernel: Kernel):
        self._kernel = kernel

    async def suggest_meeting_times(self, context: str) -> str:
        """Send ``context`` to the chat service and return its reply as a string."""
        history = ChatHistory()
        history.add_system_message(
            "You are a scheduling assistant. Suggest 3 meeting times based on context."
        )
        history.add_user_message(context)

        # Resolve the service first, then its execution settings, then call it.
        chat_service = self._kernel.get_service(type=OpenAIChatCompletion)
        settings = self._kernel.get_prompt_execution_settings_from_service_id("default")
        reply = await chat_service.get_chat_message_content(
            chat_history=history,
            settings=settings,
        )
        return str(reply)


# tests/unit/test_scheduling_agent.py
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from semantic_kernel import Kernel
from src.agents.scheduling_agent import SchedulingAgent
@pytest.fixture
def mock_kernel():
    """Provide a Kernel-spec'd mock whose chat service returns a canned reply."""
    canned_reply = MagicMock()
    # str(canned_reply) is what the agent returns, so stub __str__ directly.
    canned_reply.__str__.return_value = "Monday 2pm, Tuesday 10am, Wednesday 3pm"

    chat_service = AsyncMock()
    chat_service.get_chat_message_content.return_value = canned_reply

    kernel = MagicMock(spec=Kernel)
    kernel.get_service.return_value = chat_service
    kernel.get_prompt_execution_settings_from_service_id.return_value = MagicMock()
    return kernel
@pytest.mark.asyncio
async def test_suggest_meeting_times_returns_suggestions(mock_kernel):
agent = SchedulingAgent(kernel=mock_kernel)
result = await agent.suggest_meeting_times(
"Alice is free Mon-Wed afternoons. Bob is free all day Tuesday."
)
assert "Monday" in result or "Tuesday" in result or "Wednesday" in result
mock_kernel.get_service.assert_called_once()Testing Memory Stores
SK's memory abstraction lets you swap implementations. In tests, use the in-memory store:
# tests/unit/test_memory_retrieval.py
import pytest
from semantic_kernel.memory import SemanticTextMemory
from semantic_kernel.connectors.memory.in_memory import InMemoryCollection
from semantic_kernel.connectors.ai.embeddings import EmbeddingGeneratorBase
from unittest.mock import AsyncMock
import numpy as np
@pytest.fixture
def mock_embedding_service():
    """Provide an embedding-service double that returns reproducible vectors.

    The builtin ``hash()`` is salted per process for ``str`` inputs, so vectors
    derived from it differ between test runs. Filling a vector with one
    repeated value also makes every pair of embeddings point in the same
    direction (or be all zeros), so similarity ranking carries no information.

    The fake below seeds a NumPy generator from a SHA-256 digest of each text:
    the same text gives the same vector in every process, and different texts
    give different, non-constant vectors.

    Returns:
        An ``AsyncMock`` spec'd on ``EmbeddingGeneratorBase`` whose
        ``generate_raw_embeddings`` returns one float32 vector of length 1536
        per input text.
    """
    import hashlib  # local import keeps the fixture self-contained

    embedding_dim = 1536

    def fake_embed(texts, **kwargs):
        # Extra keyword arguments are accepted and ignored, as in the real call.
        vectors = []
        for text in texts:
            digest = hashlib.sha256(text.encode("utf-8")).digest()
            seed = int.from_bytes(digest[:8], "big")
            rng = np.random.default_rng(seed)
            vectors.append(rng.random(embedding_dim, dtype=np.float32))
        return vectors

    service = AsyncMock(spec=EmbeddingGeneratorBase)
    service.generate_raw_embeddings.side_effect = fake_embed
    return service
@pytest.mark.asyncio
async def test_memory_search_returns_relevant_records(mock_embedding_service):
memory = SemanticTextMemory(
storage=InMemoryCollection(),
embeddings_generator=mock_embedding_service,
)
# Populate memory
await memory.save_information(
collection="docs",
id="doc1",
text="Our refund policy allows returns within 30 days.",
)
await memory.save_information(
collection="docs",
id="doc2",
text="Shipping takes 3-5 business days.",
)
# Search
results = await memory.search("docs", "Can I return a product?", limit=1)
assert len(results) >= 1
assert results[0].id in ("doc1", "doc2")Testing Planners
Planners are the hardest SK component to test because they call the LLM to generate a plan. Mock the completion service to return a deterministic plan:
# tests/unit/test_planner.py
import pytest
import json
from unittest.mock import AsyncMock, MagicMock, patch
from semantic_kernel import Kernel
from semantic_kernel.planners import FunctionCallingStepwisePlanner
from semantic_kernel.contents import ChatMessageContent
# Canned plan payload: an inbox lookup followed by a calendar availability check.
_PLAN_STEPS = [
    {
        "skill": "EmailPlugin",
        "function": "get_inbox",
        "parameters": {"limit": "5"},
    },
    {
        "skill": "CalendarPlugin",
        "function": "get_availability",
        "parameters": {"date": "tomorrow"},
    },
]
MOCK_PLAN_RESPONSE = json.dumps({"plan": {"steps": _PLAN_STEPS}})
@pytest.fixture
def kernel_with_mock_llm():
    """Yield ``(kernel, mock_service)`` with ``kernel.get_service`` patched.

    The patch is active while the consuming test runs and is undone when the
    fixture resumes after the ``yield``.
    """
    kernel = Kernel()

    # Mock the chat completion service
    plan_message = MagicMock(spec=ChatMessageContent)
    plan_message.content = MOCK_PLAN_RESPONSE
    llm_service = AsyncMock()
    llm_service.get_chat_message_content.return_value = plan_message

    service_patch = patch.object(kernel, "get_service", return_value=llm_service)
    with service_patch:
        yield kernel, llm_service
@pytest.mark.asyncio
async def test_planner_generates_multi_step_plan(kernel_with_mock_llm):
kernel, mock_service = kernel_with_mock_llm
planner = FunctionCallingStepwisePlanner(service_id="default")
# Verify planner calls the LLM to generate a plan
with patch.object(planner, "_generate_plan", return_value=MOCK_PLAN_RESPONSE):
plan_text = await planner._generate_plan(kernel, "Check my inbox and schedule a meeting")
plan = json.loads(plan_text)
steps = plan["plan"]["steps"]
assert len(steps) == 2
assert steps[0]["skill"] == "EmailPlugin"
assert steps[1]["skill"] == "CalendarPlugin"Integration Tests with Real API
Keep integration tests separate and gate them behind an environment variable:
# tests/integration/test_planner_e2e.py
import os
import pytest
from semantic_kernel import Kernel
from semantic_kernel.connectors.ai.open_ai import OpenAIChatCompletion
from semantic_kernel.planners import FunctionCallingStepwisePlanner
from src.plugins.email_plugin import EmailPlugin
# Module-level mark: skip unless RUN_INTEGRATION_TESTS is set to a non-empty value.
_integration_enabled = bool(os.environ.get("RUN_INTEGRATION_TESTS"))
pytestmark = pytest.mark.skipif(
    not _integration_enabled,
    reason="Integration tests require RUN_INTEGRATION_TESTS=1 and real API key",
)
@pytest.fixture
def real_kernel():
    """Build a kernel wired to the OpenAI chat service and the email plugin.

    The API key is read from ``OPENAI_API_KEY``; indexing ``os.environ``
    raises ``KeyError`` when the variable is missing.

    NOTE(review): ``RealEmailClient`` is not imported in the visible snippet —
    confirm where it is defined before running this fixture.
    """
    kernel = Kernel()

    chat_service = OpenAIChatCompletion(
        service_id="default",
        ai_model_id="gpt-4o-mini",
        api_key=os.environ["OPENAI_API_KEY"],
    )
    kernel.add_service(chat_service)

    # Use a real email client in integration mode
    email_plugin = EmailPlugin(email_client=RealEmailClient())
    kernel.add_plugin(email_plugin, "EmailPlugin")
    return kernel
@pytest.mark.asyncio
async def test_email_scheduling_plan_executes(real_kernel):
planner = FunctionCallingStepwisePlanner(service_id="default")
result = await planner.invoke(real_kernel, "Get my last 3 emails and summarize them")
assert result.final_answer
assert len(result.final_answer) > 10CI Configuration
# .github/workflows/test-sk-app.yml
name: Semantic Kernel Tests

on: [push, pull_request]

jobs:
  # Mock-only tests: no API keys are configured for this job.
  unit-tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - run: pip install semantic-kernel pytest pytest-asyncio
      - run: pytest tests/unit/ -v

  # Real-API tests: only when the ref is refs/heads/main, with the key from secrets.
  integration-tests:
    runs-on: ubuntu-latest
    if: github.ref == 'refs/heads/main'
    env:
      RUN_INTEGRATION_TESTS: "1"
      OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      # pytest-timeout supplies the --timeout option used by the next step.
      - run: pip install semantic-kernel pytest pytest-asyncio pytest-timeout
      - run: pytest tests/integration/ -v --timeout=60

Unit tests run on every PR with no API keys. Integration tests run only on main after merge, using real credentials. This keeps PRs fast and cheap while still verifying end-to-end behavior on the main branch.