# comfyui-nvidia/deployments/ai-stack/openwebui-tools/smart_image_pipe.py

"""
title: Smart Image Studio (Pipe)
author: ai-stack
version: 0.1.2
description: Deterministic image-gen / edit / inpaint pipe — no LLM in the
    loop for the routing decision. Registers as a model in the chat-model
    dropdown ('Image Studio (Pipe)'). Reads the user's message + attached
    image (if any), routes via regex, calls ComfyUI directly, returns the
    image. Use when LLM-with-Tool tool-calling is leaking the call as text
    (the abliterated Qwen 3.5 / Open WebUI parser interop bug).
required_open_webui_version: 0.5.0
"""

import asyncio
import base64
import inspect
import io
import json
import re
import time
import uuid
from typing import Awaitable, Callable, Literal, Optional

import aiohttp
from pydantic import BaseModel, Field

# Open WebUI runtime imports — same defensive guard as the sibling Tool.
try:
    from fastapi import UploadFile
    from open_webui.models.chats import Chats
    from open_webui.models.files import Files
    from open_webui.models.users import Users
    from open_webui.routers.files import upload_file_handler

    _OPENWEBUI_RUNTIME = True
except ImportError:
    _OPENWEBUI_RUNTIME = False


# ─────────────────────────────────────────────────────────────────────────────
# Per-style settings — kept in sync with smart_image_gen.py. If you change
# checkpoint filenames in comfyui-init-models.sh, update both files.
# ─────────────────────────────────────────────────────────────────────────────

# Style name -> generation settings. Keys used by the workflow builders and
# the pipe in this file:
#   ckpt       checkpoint filename given to CheckpointLoaderSimple
#   sampler    KSampler "sampler_name"
#   scheduler  KSampler "scheduler"
#   cfg/steps  KSampler "cfg" / "steps"
#   clip_skip  negated and passed to CLIPSetLastLayer "stop_at_clip_layer"
#   prefix     text prepended verbatim to the user's message (positive prompt)
#   negative   negative prompt text
STYLES = {
    "photo": {
        "ckpt":      "CyberRealisticXLPlay_V8.0_FP16.safetensors",
        "sampler":   "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg":       4.0, "steps": 28, "clip_skip": 1,
        "prefix": "",
        "negative": (
            "cartoon, drawing, illustration, anime, manga, painting, sketch, "
            "render, 3d, cgi, plastic skin, oversaturated, "
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, extra fingers, missing fingers, "
            "watermark, signature, text, logo"
        ),
    },
    "juggernaut": {
        "ckpt":      "Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors",
        "sampler":   "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg":       4.5, "steps": 35, "clip_skip": 1,
        "prefix": "",
        "negative": (
            "cartoon, drawing, illustration, anime, painting, sketch, render, "
            "3d, cgi, plastic skin, washed out, "
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, extra fingers, missing fingers, "
            "watermark, signature, text, logo"
        ),
    },
    "pony": {
        "ckpt":      "ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
        "sampler":   "euler_ancestral",
        "scheduler": "normal",
        "cfg":       7.5, "steps": 25, "clip_skip": 2,
        "prefix":    "score_9, score_8_up, score_7_up, score_6_up, score_5_up, score_4_up, ",
        "negative": (
            "score_6, score_5, score_4, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, "
            "bad anatomy, bad hands, extra digit, fewer digits, "
            "deformed, ugly, censored, monochrome, "
            "watermark, signature, text, artist name"
        ),
    },
    "general": {
        "ckpt":      "talmendoxlSDXL_v11Beta.safetensors",
        "sampler":   "dpmpp_2m",
        "scheduler": "karras",
        "cfg":       8.0, "steps": 30, "clip_skip": 2,
        "prefix": "",
        "negative": (
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, ugly, watermark, signature, text"
        ),
    },
    "furry-nai": {
        "ckpt":      "reedFURRYMixSDXL_v23nai.safetensors",
        "sampler":   "euler_ancestral",
        "scheduler": "normal",
        "cfg":       5.0, "steps": 30, "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, high quality, detailed eyes, "
            "highres, absurdres, furry, "
        ),
        "negative": (
            "human, realistic, photorealistic, 3d, cgi, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, "
            "bad anatomy, extra digit, fewer digits, deformed, ugly, "
            "watermark, signature, text"
        ),
    },
    "furry-noob": {
        "ckpt":      "indigoVoidFurryFusedXL_noobaiV32.safetensors",
        "sampler":   "euler_ancestral",
        "scheduler": "normal",
        "cfg":       4.5, "steps": 20, "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, perfect quality, absurdres, newest, "
            "very aesthetic, vibrant colors, "
        ),
        "negative": (
            "human, realistic, photorealistic, 3d, cgi, shiny skin, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, "
            "bad anatomy, bad hands, mutated hands, "
            "watermark, signature, text"
        ),
    },
    "furry-il": {
        "ckpt":      "novaFurryXL_ilV170.safetensors",
        "sampler":   "euler_ancestral",
        "scheduler": "normal",
        "cfg":       4.0, "steps": 30, "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, amazing quality, very aesthetic, "
            "ultra-detailed, absurdres, newest, furry, anthro, "
        ),
        "negative": (
            "human, multiple tails, modern, recent, old, oldest, graphic, "
            "cartoon, painting, deformed, mutated, ugly, lowres, "
            "bad anatomy, bad hands, missing fingers, extra digits, "
            "worst quality, bad quality, sketch, jpeg artifacts, "
            "signature, watermark, text, simple background"
        ),
    },
}

# Style used when no routing rule matches; must be a key of STYLES.
DEFAULT_STYLE = "furry-il"

# Ordered (compiled pattern, style name) pairs. _route_style returns the style
# of the FIRST pattern that matches, so earlier rows take precedence over
# later ones (e.g. an explicit "pony" beats a later "photo" keyword).
ROUTING_RULES = [
    (re.compile(r"\bscore_\d", re.I),                                       "pony"),
    (re.compile(r"\bpony\b",   re.I),                                       "pony"),
    (re.compile(r"\b(noobai|noob)\b", re.I),                                "furry-noob"),
    (re.compile(r"\b(illustrious|ilxl)\b", re.I),                           "furry-il"),
    (re.compile(r"\b(furry|anthro|feral|kemono|fursona|species)\b", re.I),  "furry-il"),
    (re.compile(r"\b(juggernaut)\b", re.I),                                 "juggernaut"),
    (re.compile(r"\b(photo|photograph|realistic|portrait|selfie|cinematic)\b", re.I), "photo"),
    (re.compile(r"\b(anime|manga|2d|illustration)\b", re.I),                "pony"),
]

# Phrases that imply local-only editing → triggers inpaint mode and
# pulls out a noun phrase as the mask text.
# Each pattern captures that noun phrase (2-30 chars of word characters,
# whitespace, apostrophes and hyphens) as group 1; patterns are tried in order
# and the first one that matches is used.
INPAINT_PATTERNS = [
    re.compile(r"\b(?:change|recolor|edit|modify|replace|remove|delete|add)\s+(?:the|that|her|his|its)\s+([\w\s'-]{2,30}?)(?:\s+(?:to|into|with|so|that|and|,|\.)|$)", re.I),
    re.compile(r"\b(?:make|turn)\s+(?:the|that|her|his|its)\s+([\w\s'-]{2,30}?)\s+(?:bigger|smaller|larger|wider|taller|shorter|longer|brighter|darker|red|blue|green|yellow|orange|purple|pink|black|white|gold)", re.I),
    re.compile(r"\b(?:only|just)\s+(?:the|change the|edit the)\s+([\w\s'-]{2,30}?)(?:\s+|$)", re.I),
]


def _route_style(prompt: str) -> str:
    """Return the style of the first routing rule whose pattern matches *prompt*.

    Rules are consulted in ROUTING_RULES order; when none matches the
    module-wide DEFAULT_STYLE is returned.
    """
    matching_styles = (
        style for pattern, style in ROUTING_RULES if pattern.search(prompt)
    )
    return next(matching_styles, DEFAULT_STYLE)


def _detect_mask_text(prompt: str) -> Optional[str]:
    """Extract the object of an edit-style instruction, for use as a mask prompt.

    Each INPAINT_PATTERNS entry is tried in order. The first one whose captured
    phrase is non-empty after trimming yields ``"the <phrase>"``; ``None`` means
    the prompt does not look like a local edit.
    """
    for rule in INPAINT_PATTERNS:
        hit = rule.search(prompt)
        if hit is None:
            continue
        phrase = hit.group(1).strip().rstrip(",.").strip()
        if not phrase:
            continue
        return f"the {phrase}"
    return None


def _inherited_style(messages) -> Optional[str]:
    """Best-effort: read prior assistant message metadata for a style hint."""
    if not messages:
        return None
    for msg in reversed(messages):
        if not isinstance(msg, dict):
            continue
        # Look for a "style: X" comment in the assistant's previous text
        if msg.get("role") == "assistant":
            content = msg.get("content")
            if isinstance(content, str):
                m = re.search(r"\bstyle[:=]\s*([\w\-]+)", content)
                if m and m.group(1) in STYLES:
                    return m.group(1)
    return None


def _seed_value(seed: int) -> int:
    return seed if seed > 0 else int(time.time() * 1000) % (2**31)


def _build_txt2img(positive: str, negative: str, settings: dict,
                   width: int, height: int, seed: int) -> dict:
    """Build the ComfyUI text-to-image workflow as a node-id keyed dict.

    Node wiring (references are ``[node_id, output_index]``):
    checkpoint "4" feeds the sampler (model), the clip-skip node "10" (clip)
    and the VAE decode "8" (vae); both prompts are encoded through "10";
    the sampler "3" denoises an empty latent "5" at full strength and the
    decoded result is written by the SaveImage node "9".
    """
    sampler_node = {
        "class_type": "KSampler",
        "inputs": {
            "seed": _seed_value(seed),
            "steps": settings["steps"],
            "cfg": settings["cfg"],
            "sampler_name": settings["sampler"],
            "scheduler": settings["scheduler"],
            "denoise": 1.0,
            "model": ["4", 0],
            "positive": ["6", 0],
            "negative": ["7", 0],
            "latent_image": ["5", 0],
        },
    }
    checkpoint_node = {
        "class_type": "CheckpointLoaderSimple",
        "inputs": {"ckpt_name": settings["ckpt"]},
    }
    latent_node = {
        "class_type": "EmptyLatentImage",
        "inputs": {"width": width, "height": height, "batch_size": 1},
    }
    positive_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": positive, "clip": ["10", 0]},
    }
    negative_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": negative, "clip": ["10", 0]},
    }
    decode_node = {
        "class_type": "VAEDecode",
        "inputs": {"samples": ["3", 0], "vae": ["4", 2]},
    }
    save_node = {
        "class_type": "SaveImage",
        "inputs": {"filename_prefix": "smartpipe", "images": ["8", 0]},
    }
    clip_skip_node = {
        "class_type": "CLIPSetLastLayer",
        # clip_skip is stored positive; the node takes it negated.
        "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]},
    }
    return {
        "3": sampler_node,
        "4": checkpoint_node,
        "5": latent_node,
        "6": positive_node,
        "7": negative_node,
        "8": decode_node,
        "9": save_node,
        "10": clip_skip_node,
    }


def _build_img2img(positive: str, negative: str, settings: dict,
                   image_filename: str, denoise: float, seed: int) -> dict:
    """Build the ComfyUI image-to-image workflow as a node-id keyed dict.

    Same graph as text-to-image except that the sampler's latent comes from
    VAE-encoding ("11") the image loaded by node "12" from *image_filename*,
    and the sampler runs at the caller-supplied *denoise* strength.
    """
    sampler_node = {
        "class_type": "KSampler",
        "inputs": {
            "seed": _seed_value(seed),
            "steps": settings["steps"],
            "cfg": settings["cfg"],
            "sampler_name": settings["sampler"],
            "scheduler": settings["scheduler"],
            "denoise": denoise,
            "model": ["4", 0],
            "positive": ["6", 0],
            "negative": ["7", 0],
            "latent_image": ["11", 0],
        },
    }
    checkpoint_node = {
        "class_type": "CheckpointLoaderSimple",
        "inputs": {"ckpt_name": settings["ckpt"]},
    }
    positive_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": positive, "clip": ["10", 0]},
    }
    negative_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": negative, "clip": ["10", 0]},
    }
    decode_node = {
        "class_type": "VAEDecode",
        "inputs": {"samples": ["3", 0], "vae": ["4", 2]},
    }
    save_node = {
        "class_type": "SaveImage",
        "inputs": {"filename_prefix": "smartpipe", "images": ["8", 0]},
    }
    clip_skip_node = {
        "class_type": "CLIPSetLastLayer",
        # clip_skip is stored positive; the node takes it negated.
        "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]},
    }
    encode_node = {
        "class_type": "VAEEncode",
        "inputs": {"pixels": ["12", 0], "vae": ["4", 2]},
    }
    load_node = {
        "class_type": "LoadImage",
        "inputs": {"image": image_filename},
    }
    return {
        "3": sampler_node,
        "4": checkpoint_node,
        "6": positive_node,
        "7": negative_node,
        "8": decode_node,
        "9": save_node,
        "10": clip_skip_node,
        "11": encode_node,
        "12": load_node,
    }


def _build_inpaint(positive: str, negative: str, settings: dict,
                   image_filename: str, mask_text: str,
                   denoise: float, seed: int) -> dict:
    """Build the ComfyUI text-masked inpaint workflow as a node-id keyed dict.

    Extends the image-to-image graph: the loaded image "12" is segmented by
    the GroundingDINO + SAM node "16" using *mask_text* as its prompt, the
    resulting mask (output 1) is grown by 12 px in "17", and "13" attaches it
    to the encoded latent as a noise mask before sampling.
    """
    sampler_node = {
        "class_type": "KSampler",
        "inputs": {
            "seed": _seed_value(seed),
            "steps": settings["steps"],
            "cfg": settings["cfg"],
            "sampler_name": settings["sampler"],
            "scheduler": settings["scheduler"],
            "denoise": denoise,
            "model": ["4", 0],
            "positive": ["6", 0],
            "negative": ["7", 0],
            "latent_image": ["13", 0],
        },
    }
    checkpoint_node = {
        "class_type": "CheckpointLoaderSimple",
        "inputs": {"ckpt_name": settings["ckpt"]},
    }
    positive_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": positive, "clip": ["10", 0]},
    }
    negative_node = {
        "class_type": "CLIPTextEncode",
        "inputs": {"text": negative, "clip": ["10", 0]},
    }
    decode_node = {
        "class_type": "VAEDecode",
        "inputs": {"samples": ["3", 0], "vae": ["4", 2]},
    }
    save_node = {
        "class_type": "SaveImage",
        "inputs": {"filename_prefix": "smartpipe", "images": ["8", 0]},
    }
    clip_skip_node = {
        "class_type": "CLIPSetLastLayer",
        # clip_skip is stored positive; the node takes it negated.
        "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]},
    }
    encode_node = {
        "class_type": "VAEEncode",
        "inputs": {"pixels": ["12", 0], "vae": ["4", 2]},
    }
    load_node = {
        "class_type": "LoadImage",
        "inputs": {"image": image_filename},
    }
    noise_mask_node = {
        "class_type": "SetLatentNoiseMask",
        "inputs": {"samples": ["11", 0], "mask": ["17", 0]},
    }
    sam_loader_node = {
        "class_type": "SAMModelLoader (segment anything)",
        "inputs": {"model_name": "sam_hq_vit_h (2.57GB)"},
    }
    dino_loader_node = {
        "class_type": "GroundingDinoModelLoader (segment anything)",
        "inputs": {"model_name": "GroundingDINO_SwinT_OGC (694MB)"},
    }
    segment_node = {
        "class_type": "GroundingDinoSAMSegment (segment anything)",
        "inputs": {
            "sam_model": ["14", 0],
            "grounding_dino_model": ["15", 0],
            "image": ["12", 0],
            "prompt": mask_text,
            "threshold": 0.3,
        },
    }
    grow_mask_node = {
        "class_type": "GrowMask",
        "inputs": {"mask": ["16", 1], "expand": 12, "tapered_corners": True},
    }
    return {
        "3": sampler_node,
        "4": checkpoint_node,
        "6": positive_node,
        "7": negative_node,
        "8": decode_node,
        "9": save_node,
        "10": clip_skip_node,
        "11": encode_node,
        "12": load_node,
        "13": noise_mask_node,
        "14": sam_loader_node,
        "15": dino_loader_node,
        "16": segment_node,
        "17": grow_mask_node,
    }


# Pulls the file id (8+ hex digits / dashes) out of a ".../files/<id>" URL,
# with an optional "api/v1/" prefix and optional "/content" suffix.
_FILE_URL_ID_RE = re.compile(r"/(?:api/v1/)?files/([0-9a-fA-F-]{8,})(?:/content)?")


def _file_dict_is_image(f: dict) -> bool:
    ftype = (f.get("type") or "").lower()
    fname = (f.get("name") or f.get("filename") or "").lower()
    return "image" in ftype or fname.endswith((".png", ".jpg", ".jpeg", ".webp"))


async def _read_file_dict(f: dict) -> Optional[bytes]:
    for path_key in ("path", "filepath", "file_path"):
        path = f.get(path_key)
        if path:
            try:
                with open(path, "rb") as fh:
                    return fh.read()
            except OSError:
                pass
    candidate_ids = []
    if f.get("id"):
        candidate_ids.append(f["id"])
    url = f.get("url")
    if url:
        m = _FILE_URL_ID_RE.search(url)
        if m:
            candidate_ids.append(m.group(1))
    if _OPENWEBUI_RUNTIME:
        for fid in candidate_ids:
            try:
                file_model = await Files.get_file_by_id(fid)
                if file_model is None:
                    continue
                path = getattr(file_model, "path", None)
                if not path:
                    meta = getattr(file_model, "meta", None) or {}
                    path = meta.get("path") if isinstance(meta, dict) else getattr(meta, "path", None)
                if path:
                    try:
                        with open(path, "rb") as fh:
                            return fh.read()
                    except OSError:
                        pass
            except Exception:
                pass
    return None


async def _extract_attached_image(files, messages, metadata, session) -> Optional[bytes]:
    # 1. Inline data URIs
    for msg in reversed(messages or []):
        content = msg.get("content") if isinstance(msg, dict) else None
        if isinstance(content, list):
            for block in content:
                if not isinstance(block, dict) or block.get("type") != "image_url":
                    continue
                url = (block.get("image_url") or {}).get("url", "")
                if url.startswith("data:image"):
                    try:
                        return base64.b64decode(url.split(",", 1)[1])
                    except Exception:
                        pass
    # 2. messages[].files
    for msg in reversed(messages or []):
        if not isinstance(msg, dict):
            continue
        for f in (msg.get("files") or []):
            if isinstance(f, dict) and _file_dict_is_image(f):
                data = await _read_file_dict(f)
                if data is not None:
                    return data
    # 3. __files__
    for f in files or []:
        if isinstance(f, dict) and _file_dict_is_image(f):
            data = await _read_file_dict(f)
            if data is not None:
                return data
    # 4. DB lookup (assistant-emitted files often only land here)
    if _OPENWEBUI_RUNTIME and metadata:
        chat_id = metadata.get("chat_id")
        if chat_id:
            try:
                chat = await Chats.get_chat_by_id(chat_id)
                chat_data = getattr(chat, "chat", None) if chat else None
                chat_messages = (chat_data or {}).get("messages", []) if isinstance(chat_data, dict) else []
                for msg in reversed(chat_messages):
                    for f in (msg.get("files") or []) if isinstance(msg, dict) else []:
                        if isinstance(f, dict) and _file_dict_is_image(f):
                            data = await _read_file_dict(f)
                            if data is not None:
                                return data
            except Exception:
                pass
    return None


async def _upload_to_comfyui(session, base, raw) -> Optional[str]:
    """POST *raw* image bytes to ``{base}/upload/image`` under a random name.

    Returns the ``name`` field of the JSON reply (falling back to the name
    that was sent) on HTTP 200, otherwise ``None``.
    """
    upload_name = f"smartpipe_{uuid.uuid4().hex[:12]}.png"
    payload = aiohttp.FormData()
    payload.add_field("image", raw, filename=upload_name, content_type="image/png")
    payload.add_field("overwrite", "true")
    endpoint = f"{base}/upload/image"
    async with session.post(endpoint, data=payload) as reply:
        if reply.status == 200:
            reply_json = await reply.json()
            return reply_json.get("name", upload_name)
        return None


async def _push_image_to_chat(raw, prefix, request, user_dict, metadata, event_emitter) -> bool:
    if not (_OPENWEBUI_RUNTIME and request and user_dict and event_emitter):
        return False
    try:
        user = await Users.get_user_by_id(user_dict.get("id"))
        if not user:
            return False
        upload = UploadFile(
            file=io.BytesIO(raw),
            filename=f"{prefix}_{uuid.uuid4().hex[:8]}.png",
            headers={"content-type": "image/png"},
        )
        result = upload_file_handler(
            request=request, file=upload,
            metadata={"chat_id": (metadata or {}).get("chat_id"),
                      "message_id": (metadata or {}).get("message_id")},
            process=False, user=user,
        )
        file_item = await result if inspect.iscoroutine(result) else result
        url = request.app.url_path_for("get_file_content_by_id", id=file_item.id)
        await event_emitter({
            "type": "files",
            "data": {"files": [{"type": "image", "url": url}]},
        })
        return True
    except Exception:
        return False


async def _submit_and_fetch(session, base, workflow, timeout_seconds, emit, settings):
    SAVE_NODE_ID = "9"
    client_id = str(uuid.uuid4())
    async with session.post(
        f"{base}/prompt", json={"prompt": workflow, "client_id": client_id}
    ) as resp:
        if resp.status != 200:
            return None, f"ComfyUI rejected the prompt: {resp.status} {await resp.text()}"
        prompt_id = (await resp.json()).get("prompt_id")
        if not prompt_id:
            return None, "ComfyUI didn't return a prompt_id."

    await emit(
        f"Sampling — {settings['sampler']}/{settings['scheduler']}, "
        f"CFG {settings['cfg']}, {settings['steps']} steps"
    )
    deadline = time.time() + timeout_seconds
    output_images: list = []
    while time.time() < deadline:
        await asyncio.sleep(1.5)
        async with session.get(f"{base}/history/{prompt_id}") as resp:
            if resp.status != 200:
                continue
            history = await resp.json()
        if prompt_id in history:
            outputs = history[prompt_id].get("outputs", {}) or {}
            save_imgs = (outputs.get(SAVE_NODE_ID) or {}).get("images", [])
            if save_imgs:
                output_images.extend(save_imgs)
            if not output_images:
                for node_out in outputs.values():
                    output_images.extend(node_out.get("images", []))
            if output_images:
                break

    if not output_images:
        return None, f"Timed out after {timeout_seconds}s waiting for image."

    img = output_images[0]
    params = {
        "filename": img["filename"],
        "subfolder": img.get("subfolder", ""),
        "type": img.get("type", "output"),
    }
    async with session.get(f"{base}/view", params=params) as resp:
        if resp.status != 200:
            return None, f"Failed to fetch image: {resp.status}"
        return await resp.read(), None


def _extract_user_text(body: dict) -> str:
    """Pull the latest user message's text content."""
    messages = body.get("messages", [])
    for msg in reversed(messages):
        if not isinstance(msg, dict) or msg.get("role") != "user":
            continue
        content = msg.get("content")
        if isinstance(content, str):
            return content.strip()
        if isinstance(content, list):
            parts = []
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    parts.append(block.get("text", ""))
            return " ".join(parts).strip()
    return ""


class Pipe:
    """Open WebUI pipe that turns a chat message into a ComfyUI image job.

    With no image in the conversation the message is used as a text-to-image
    prompt. With an image present, the message is either a masked inpaint
    (when an edit phrase names an object) or a whole-image img2img edit.
    """

    class Valves(BaseModel):
        # Settings exposed by this pipe; defaults are used until overridden.
        COMFYUI_BASE_URL: str = Field(
            default="http://comfyui:8188",
            description="ComfyUI server URL reachable from the open-webui container.",
        )
        TIMEOUT_SECONDS: int = Field(default=600)
        DEFAULT_WIDTH: int = Field(default=1024)
        DEFAULT_HEIGHT: int = Field(default=1024)
        DEFAULT_DENOISE_IMG2IMG: float = Field(default=0.7)
        DEFAULT_DENOISE_INPAINT: float = Field(default=1.0)
        FORCE_STYLE: str = Field(
            default="",
            description="Override style routing. Empty = auto-route. Set to "
                        "one of: photo, juggernaut, pony, general, "
                        "furry-nai, furry-noob, furry-il.",
        )

    def __init__(self):
        self.valves = self.Valves()
        self.id = "image-studio-pipe"
        self.name = "Image Studio (Pipe)"

    async def pipe(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __request__=None,
        __metadata__: Optional[dict] = None,
        __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
        __files__: Optional[list] = None,
    ) -> str:
        """Generate or edit an image for the latest user message.

        Args:
            body: Chat request body; ``messages`` is read.
            __user__: Dict describing the requesting user (``id`` is used).
            __request__: Incoming request object, needed to store the image.
            __metadata__: Request metadata (``chat_id`` / ``message_id``).
            __event_emitter__: Async callable for status and file events.
            __files__: Files attached to the request, searched for a source
                image in addition to the message history.

        Returns:
            A one-line summary containing ``style: <name>`` (read back on the
            next turn to keep the style), or an error message. When the image
            could not be attached as a file it is embedded inline instead.
        """
        user_text = _extract_user_text(body)
        if not user_text:
            return "Type a message describing the image you want."

        async def emit(msg: str, done: bool = False):
            if __event_emitter__:
                await __event_emitter__({
                    "type": "status",
                    "data": {"description": msg, "done": done},
                })

        async def fail(msg: str) -> str:
            # Close the status line too, otherwise the UI keeps showing the
            # last in-progress step after an error.
            await emit(msg, done=True)
            return msg

        # Style: explicit valve override > inherited from prior assistant
        # message > keyword detection on user text > default.
        chosen = (
            self.valves.FORCE_STYLE.strip()
            or _inherited_style(body.get("messages"))
            or _route_style(user_text)
        )
        if chosen not in STYLES:
            chosen = DEFAULT_STYLE
        settings = STYLES[chosen]

        base = self.valves.COMFYUI_BASE_URL.rstrip("/")
        positive = f"{settings['prefix']}{user_text}"
        negative = settings["negative"]

        try:
            async with aiohttp.ClientSession() as session:
                await emit("Looking for attached image…")
                source_bytes = await _extract_attached_image(
                    __files__, body.get("messages"), __metadata__, session,
                )

                if source_bytes is None:
                    # No image → txt2img
                    await emit(f"Generating ({chosen})")
                    workflow = _build_txt2img(
                        positive, negative, settings,
                        self.valves.DEFAULT_WIDTH, self.valves.DEFAULT_HEIGHT, 0,
                    )
                    tag = "gen"
                else:
                    # Image present → upload, then inpaint or img2img
                    uploaded = await _upload_to_comfyui(session, base, source_bytes)
                    if not uploaded:
                        return await fail("Failed to upload source image to ComfyUI.")

                    mask_text = _detect_mask_text(user_text)
                    if mask_text:
                        await emit(
                            f"Inpainting ({chosen}, mask='{mask_text}', "
                            f"denoise={self.valves.DEFAULT_DENOISE_INPAINT})"
                        )
                        workflow = _build_inpaint(
                            positive, negative, settings, uploaded, mask_text,
                            self.valves.DEFAULT_DENOISE_INPAINT, 0,
                        )
                        tag = f"edit (inpaint: {mask_text})"
                    else:
                        await emit(
                            f"Editing ({chosen}, "
                            f"denoise={self.valves.DEFAULT_DENOISE_IMG2IMG})"
                        )
                        workflow = _build_img2img(
                            positive, negative, settings, uploaded,
                            self.valves.DEFAULT_DENOISE_IMG2IMG, 0,
                        )
                        tag = "edit (img2img)"

                raw, err = await _submit_and_fetch(
                    session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings,
                )
        except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
            # ComfyUI down / unreachable: answer in chat instead of raising.
            return await fail(f"Could not reach ComfyUI at {base}: {exc}")
        if err:
            return await fail(err)

        pushed = await _push_image_to_chat(
            raw, "smartpipe", __request__, __user__, __metadata__, __event_emitter__,
        )
        await emit(f"Done — {chosen}", done=True)

        # Single-line plain-English follow-up. Emit the style as
        # "style: <name>" so the inheritance helper can find it next turn.
        summary = f"Done — style: {chosen}, {tag}."
        if pushed:
            return summary
        # Attaching as a file failed; embed the image inline so the finished
        # render is not lost.
        encoded = base64.b64encode(raw).decode("ascii")
        return f"![generated image](data:image/png;base64,{encoded})\n\n{summary}"