Guardrails & Safety Skill

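Patterns for protecting AI applications from misuse: input and output guardrails, prompt-injection detection, constitutional filtering, and PII redaction.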

Input Guardrails

class InputGuard:
    """Screen user-supplied text before it is passed on to a model.

    Three checks run in order: toxicity, PII, and prompt injection. Toxic or
    injected input is rejected; PII alone does not reject the input but is
    redacted from the text that is returned.
    """

    def __init__(self):
        self.toxicity = load_toxicity_model()
        self.pii = PIIDetector()
        self.injection = InjectionDetector()

    def check(self, text):
        """Run all input checks on ``text``.

        Returns a dict with:
          * ``"allowed"``   - False if the toxicity or injection check fired.
          * ``"issues"``    - list of short labels for every check that fired.
          * ``"sanitized"`` - ``text`` with any detected PII redacted.
        """
        result = {"allowed": True, "issues": []}

        # Toxicity: block when the score is strictly above 0.7.
        if self.toxicity.predict(text) > 0.7:
            result["allowed"] = False
            result["issues"].append("toxic")

        # PII: never blocks, only redacts.
        pii = self.pii.detect(text)
        if pii:
            # Report only the categories that were found. Echoing the matched
            # values here would copy the raw PII into the result (and into
            # any log built from it), defeating the redaction below.
            result["issues"].append(f"pii: {', '.join(sorted(pii))}")
            text = self.pii.redact(text)

        # Injection: evaluated on the already-redacted text.
        if self.injection.detect(text):
            result["allowed"] = False
            result["issues"].append("injection")

        result["sanitized"] = text
        return result

Output Guardrails

class OutputGuard:
    """Screen model output before it is returned to the user.

    ``check`` relies on three collaborators:
      * ``fact_checker.check(output, context)`` -> score
      * ``toxicity.predict(output)`` -> score
      * ``citation_validator.check(output)`` -> collection of invalid citations
    """

    def __init__(self, *, fact_checker=None, toxicity=None, citation_validator=None):
        """Store the collaborators used by ``check``.

        Every argument is optional so that ``OutputGuard()`` keeps working.
        Only the collaborators that are supplied get bound, so attributes
        provided another way (a subclass, or assignment after construction)
        are not overwritten with ``None``.
        """
        if fact_checker is not None:
            self.fact_checker = fact_checker
        if toxicity is not None:
            self.toxicity = toxicity
        if citation_validator is not None:
            self.citation_validator = citation_validator

    def check(self, output, context=None):
        """Run all output checks on ``output``.

        ``context`` is the reference material for the factuality check; the
        check is skipped when it is missing or empty.

        Returns a dict with ``"allowed"`` (False only when the toxicity check
        fires) and ``"issues"`` (labels for every check that fired).
        """
        result = {"allowed": True, "issues": []}

        # Factuality: a score below 0.7 is reported but does not block.
        if context:
            if self.fact_checker.check(output, context) < 0.7:
                result["issues"].append("hallucination")

        # Toxicity: block when the score is strictly above 0.5.
        if self.toxicity.predict(output) > 0.5:
            result["allowed"] = False
            result["issues"].append("toxic")

        # Citations: report how many are invalid; does not block.
        invalid = self.citation_validator.check(output)
        if invalid:
            result["issues"].append(f"bad_citations: {len(invalid)}")

        return result

Injection Detection

class InjectionDetector:
    """Heuristic, regex-based detector for common prompt-injection phrasings.

    ``PATTERNS`` holds regular-expression source strings. They are searched
    against the lower-cased input, so they must be written in lower case.
    """

    # Words are separated with \s+ rather than a literal space so that doubled
    # spaces, tabs or line breaks between words do not slip past the filter.
    PATTERNS = [
        # One or more qualifiers may precede "instructions", so the common
        # "ignore all previous instructions" is caught as well.
        r"ignore\s+(?:(?:all|any|the|previous|prior|above|earlier)\s+)+instructions",
        r"forget\s+(?:(?:your|all|the|previous|prior)\s+)+(?:instructions|rules)",
        r"you\s+are\s+now",
        r"new\s+persona",
        r"act\s+as",
        r"pretend\s+to\s+be",
        r"disregard",
    ]

    def detect(self, text):
        """Return True if any pattern in ``PATTERNS`` occurs in ``text``.

        Matching is case-insensitive because ``text`` is lower-cased first.
        """
        text_lower = text.lower()
        return any(re.search(pattern, text_lower) for pattern in self.PATTERNS)

Constitutional AI

class ConstitutionalFilter:
    """Critique-and-revise pass that checks a response against each principle.

    For every principle, a critic model is asked whether the response
    violates it. If the critique contains the word "violates", a reviser
    model rewrites the response, and the rewritten text is what later
    principles are checked against.
    """

    def __init__(self, principles):
        """Keep the principle list and load the critic and reviser models."""
        self.principles = principles
        self.critic = load_model("critic")
        self.reviser = load_model("reviser")

    def filter(self, response):
        """Return ``response`` after applying every principle in order."""
        current = response
        for rule in self.principles:
            # NOTE: the indentation inside the prompt literals is part of the
            # text sent to the models; keep it as is.
            verdict = self.critic.generate(f"""
            Does this violate: "{rule}"?
            Response: {current}
            """)

            # Nothing to fix for this principle; move on to the next one.
            if "violates" not in verdict.lower():
                continue

            current = self.reviser.generate(f"""
                Rewrite to comply with: "{rule}"
                Original: {current}
                Critique: {verdict}
                """)

        return current

# Example principle statements, one plain-English rule per entry.
# NOTE(review): presumably meant to be passed as the `principles` argument of
# ConstitutionalFilter; nothing in the visible code wires the two together.
PRINCIPLES = [
    "Do not provide harmful instructions",
    "Do not reveal personal information",
    "Acknowledge uncertainty",
    "Do not fabricate facts",
]

PII Protection

class PIIDetector:
    """Regex-based finder and redactor for a few common PII formats."""

    # Category label -> regular expression recognising that category.
    PATTERNS = {
        "email": r"\b[\w.-]+@[\w.-]+\.\w+\b",
        "phone": r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",
        "ssn": r"\b\d{3}-\d{2}-\d{4}\b",
        "credit_card": r"\b\d{4}[-\s]?\d{4}[-\s]?\d{4}[-\s]?\d{4}\b",
    }

    def detect(self, text):
        """Return ``{category: [matches, ...]}`` for categories found in ``text``.

        Categories with no match are omitted, so the result is an empty
        (falsy) dict when nothing is found.
        """
        hits_by_label = {
            label: re.findall(regex, text)
            for label, regex in self.PATTERNS.items()
        }
        return {label: hits for label, hits in hits_by_label.items() if hits}

    def redact(self, text):
        """Return ``text`` with every match replaced by ``[CATEGORY]``.

        Patterns are applied one after another in ``PATTERNS`` order.
        """
        redacted = text
        for label in self.PATTERNS:
            placeholder = "[{}]".format(label.upper())
            redacted = re.sub(self.PATTERNS[label], placeholder, redacted)
        return redacted

Best Practices

Defend in depth: stack multiple independent layers
Log all blocked content
Run adversarial tests regularly
Update detection patterns continuously
Fail closed: block when uncertain

guardrails-safety

Guardrails & Safety Skill

Input Guardrails

Output Guardrails

Injection Detection

Constitutional AI

PII Protection

Best Practices