11 changes: 11 additions & 0 deletions docs/my-website/docs/proxy/guardrails/lakera_ai.md
@@ -29,6 +29,13 @@ guardrails:
mode: "pre_call"
api_key: os.environ/LAKERA_API_KEY
api_base: os.environ/LAKERA_API_BASE
- guardrail_name: "lakera-monitor"
litellm_params:
guardrail: lakera_v2
mode: "pre_call"
on_flagged: "monitor" # Log violations but don't block
api_key: os.environ/LAKERA_API_KEY
api_base: os.environ/LAKERA_API_BASE

```

@@ -144,6 +151,7 @@ guardrails:
# breakdown: Optional[bool] = True,
# metadata: Optional[Dict] = None,
# dev_info: Optional[bool] = True,
# on_flagged: Optional[str] = "block", # "block" or "monitor"
```

- `api_base`: (Optional[str]) The base URL of the Lakera API. Defaults to `https://api.lakera.ai`
@@ -153,3 +161,6 @@ guardrails:
- `breakdown`: (Optional[bool]) When true, the response includes a breakdown list of the detectors that were run, as defined in the policy, and whether each of them detected something.
- `metadata`: (Optional[Dict]) Metadata tags to attach to screening requests, as an object containing arbitrary key-value pairs.
- `dev_info`: (Optional[bool]) When true, the response includes an object with developer information about the build of Lakera Guard.
- `on_flagged`: (Optional[str]) Action to take when content is flagged. Defaults to `"block"`.
  - `"block"`: Raises an HTTP 400 exception when violations are detected (default behavior).
  - `"monitor"`: Logs violations but allows the request to proceed. Useful for tuning security policies without blocking legitimate requests.
33 changes: 25 additions & 8 deletions litellm/proxy/guardrails/guardrail_hooks/lakera_ai_v2.py
@@ -33,6 +33,7 @@ def __init__(
breakdown: Optional[bool] = True,
metadata: Optional[Dict] = None,
dev_info: Optional[bool] = True,
on_flagged: Optional[str] = "block",
**kwargs,
):
"""
@@ -48,6 +49,7 @@ def __init__(
breakdown: Optional[bool] = True,
metadata: Optional[Dict] = None,
dev_info: Optional[bool] = True,
on_flagged: Optional[str] = "block",  # Action to take when content is flagged: "block" or "monitor"
"""
self.async_handler = get_async_httpx_client(
llm_provider=httpxSpecialProvider.GuardrailCallback
@@ -61,6 +63,7 @@ def __init__(
self.breakdown: Optional[bool] = breakdown
self.metadata: Optional[Dict] = metadata
self.dev_info: Optional[bool] = dev_info
self.on_flagged: str = on_flagged or "block"
super().__init__(**kwargs)

async def call_v2_guard(
@@ -228,10 +231,17 @@ async def async_pre_call_hook(
"Lakera AI: Masked PII in messages instead of blocking request"
)
else:
# If there are other violations or not set to mask PII, raise exception
raise self._get_http_exception_for_blocked_guardrail(
lakera_guardrail_response
)
# Check on_flagged setting
if self.on_flagged == "monitor":
verbose_proxy_logger.warning(
"Lakera Guardrail: Monitoring mode - violation detected but allowing request"
)
# Log violation but continue
elif self.on_flagged == "block":
# If there are other violations or not set to mask PII, raise exception
raise self._get_http_exception_for_blocked_guardrail(
lakera_guardrail_response
)

#########################################################
########## 3. Add the guardrail to the applied guardrails header ##########
@@ -286,10 +296,17 @@ async def async_moderation_hook(
"Lakera AI: Masked PII in messages instead of blocking request"
)
else:
# If there are other violations or not set to mask PII, raise exception
raise self._get_http_exception_for_blocked_guardrail(
lakera_guardrail_response
)
# Check on_flagged setting
if self.on_flagged == "monitor":
verbose_proxy_logger.warning(
"Lakera Guardrail: Monitoring mode - violation detected but allowing request"
)
# Log violation but continue
elif self.on_flagged == "block":
# If there are other violations or not set to mask PII, raise exception
raise self._get_http_exception_for_blocked_guardrail(
lakera_guardrail_response
)

#########################################################
########## 3. Add the guardrail to the applied guardrails header ##########
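The same `on_flagged` branch appears in both `async_pre_call_hook` and `async_moderation_hook`. As a standalone illustration of the dispatch (simplified, hypothetical names; the real code logs through `verbose_proxy_logger` and raises the `HTTPException` built by `_get_http_exception_for_blocked_guardrail`):

```python
# Simplified sketch of the on_flagged dispatch added to both hooks.
# ValueError stands in for the HTTP 400 exception the real hook raises.
import logging

logger = logging.getLogger(__name__)

def apply_on_flagged(on_flagged: str, flagged: bool) -> None:
    if not flagged:
        return  # nothing detected: both modes let the request through
    if on_flagged == "monitor":
        # log the violation but allow the request to proceed
        logger.warning("Monitoring mode - violation detected but allowing request")
    elif on_flagged == "block":
        # default behavior: reject the request
        raise ValueError("Violated Lakera guardrail policy")

apply_on_flagged("monitor", flagged=True)  # warns, does not raise
apply_on_flagged("block", flagged=False)   # no-op: nothing was flagged
```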
1 change: 1 addition & 0 deletions litellm/proxy/guardrails/guardrail_initializers.py
@@ -65,6 +65,7 @@ def initialize_lakera_v2(litellm_params: LitellmParams, guardrail: Guardrail):
breakdown=litellm_params.breakdown,
metadata=litellm_params.metadata,
dev_info=litellm_params.dev_info,
on_flagged=litellm_params.on_flagged,
)
litellm.logging_callback_manager.add_litellm_callback(_lakera_v2_callback)
return _lakera_v2_callback
4 changes: 4 additions & 0 deletions litellm/types/guardrails.py
@@ -391,6 +391,10 @@ class LakeraV2GuardrailConfigModel(BaseModel):
default=True,
description="Whether to include developer information in the response",
)
on_flagged: Optional[Literal["block", "monitor"]] = Field(
default="block",
description="Action to take when content is flagged: 'block' (raise exception) or 'monitor' (log only)",
)


class LassoGuardrailConfigModel(BaseModel):
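Because the field is typed `Optional[Literal["block", "monitor"]]`, pydantic rejects any other value when the config is parsed. A quick standalone check (assuming pydantic v2; `_Demo` is a stand-in for `LakeraV2GuardrailConfigModel`):

```python
from typing import Literal, Optional

from pydantic import BaseModel, Field, ValidationError

class _Demo(BaseModel):
    # mirrors the new field on LakeraV2GuardrailConfigModel
    on_flagged: Optional[Literal["block", "monitor"]] = Field(default="block")

print(_Demo().on_flagged)                      # block (the default)
print(_Demo(on_flagged="monitor").on_flagged)  # monitor
try:
    _Demo(on_flagged="audit")                  # not an allowed literal
except ValidationError as err:
    print(err.errors()[0]["type"])             # literal_error
```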
129 changes: 129 additions & 0 deletions tests/guardrails_tests/test_lakera_v2.py
@@ -231,3 +231,132 @@ async def test_lakera_blocks_flagged_content_with_user_scenario():
assert lakera_response["metadata"]["request_uuid"] == "b7cd4c8a-28aa-4285-a245-2befee514dbf"
assert len(lakera_response["breakdown"]) == 16 # All the breakdown items from the user's scenario


@pytest.mark.asyncio
async def test_lakera_monitor_mode_allows_flagged_content():
"""Test that monitor mode logs violations but allows requests to proceed."""

lakera_guardrail = LakeraAIGuardrail(
api_key="test_key",
on_flagged="monitor", # Monitor mode
)

# Mock response with violations
mock_response = {
'payload': [],
'flagged': True,
'breakdown': [
{'detector_type': 'moderated_content/violence', 'detected': True, 'message_id': 0},
{'detector_type': 'prompt_attack', 'detected': True, 'message_id': 0},
]
}

with patch.object(lakera_guardrail, 'call_v2_guard', new_callable=AsyncMock) as mock_call:
mock_call.return_value = (mock_response, {})

data = {
"messages": [
{"role": "user", "content": "Some harmful content"}
],
"model": "gpt-3.5-turbo",
"metadata": {}
}

user_api_key_dict = UserAPIKeyAuth(api_key="test_key")
cache = DualCache()

# Should NOT raise an exception in monitor mode
result = await lakera_guardrail.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=cache,
data=data,
call_type="completion"
)

# Verify request was allowed through
assert result is not None
assert "messages" in result


@pytest.mark.asyncio
async def test_lakera_block_mode_raises_exception():
"""Test that block mode (default) raises HTTPException for violations."""

lakera_guardrail = LakeraAIGuardrail(
api_key="test_key",
on_flagged="block", # Block mode (default)
)

mock_response = {
'payload': [],
'flagged': True,
'breakdown': [
{'detector_type': 'moderated_content/violence', 'detected': True, 'message_id': 0},
]
}

with patch.object(lakera_guardrail, 'call_v2_guard', new_callable=AsyncMock) as mock_call:
mock_call.return_value = (mock_response, {})

data = {
"messages": [
{"role": "user", "content": "Harmful content"}
],
"model": "gpt-3.5-turbo",
"metadata": {}
}

user_api_key_dict = UserAPIKeyAuth(api_key="test_key")
cache = DualCache()

# Should raise HTTPException in block mode
with pytest.raises(HTTPException) as exc_info:
await lakera_guardrail.async_pre_call_hook(
user_api_key_dict=user_api_key_dict,
cache=cache,
data=data,
call_type="completion"
)

assert exc_info.value.status_code == 400


@pytest.mark.asyncio
async def test_lakera_monitor_mode_during_call():
"""Test monitor mode works with during_call (moderation_hook)."""

lakera_guardrail = LakeraAIGuardrail(
api_key="test_key",
on_flagged="monitor",
)

mock_response = {
'payload': [],
'flagged': True,
'breakdown': [
{'detector_type': 'prompt_attack', 'detected': True, 'message_id': 0},
]
}

with patch.object(lakera_guardrail, 'call_v2_guard', new_callable=AsyncMock) as mock_call:
mock_call.return_value = (mock_response, {})

data = {
"messages": [
{"role": "user", "content": "Test content"}
],
"model": "gpt-3.5-turbo",
"metadata": {}
}

user_api_key_dict = UserAPIKeyAuth(api_key="test_key")

# Should NOT raise exception in monitor mode
result = await lakera_guardrail.async_moderation_hook(
data=data,
user_api_key_dict=user_api_key_dict,
call_type="completion"
)

assert result is not None
