smart_image_gen v0.2: per-style sampler/CFG/steps/CLIP-skip + prompt prefixes

Researched each of the seven SDXL checkpoints on Civitai and encoded the creator-recommended generation defaults per style instead of one global set.

Material differences (`euler_a` below is the sampler named `euler_ancestral` in the code):
- photo (CyberRealistic): dpmpp_2m_sde / karras / CFG 4 / 28 steps / CLIP 1
- juggernaut: dpmpp_2m_sde / karras / CFG 4.5 / 35 steps / CLIP 1
- pony: euler_a / normal / CFG 7.5 / 25 steps / CLIP 2
- general (Talmendo): dpmpp_2m / karras / CFG 8 / 30 steps / CLIP 2
- furry-nai (Reed): euler_a / normal / CFG 5 / 30 steps / CLIP 2
- furry-noob (IndigoVoid): euler_a only / normal / CFG 4.5 / 20 steps / CLIP 2
- furry-il (NovaFurry): euler_a / normal / CFG 4 / 30 steps / CLIP 2

Three prompt-prefix dialects are auto-prepended (NEVER cross-contaminated): photoreal models get nothing, Pony gets the full score_9..score_4_up chain (mandatory), and the NoobAI/Illustrious furry models get their booru quality + year-tag prefixes (masterpiece/best quality/absurdres/newest/etc).

Workflow now includes a CLIPSetLastLayer node so per-style CLIP skip works.

Routing default for generic "furry" flipped from Reed (NAI) to NovaFurry (Illustrious) — current sweet-spot consensus.

Removed global DEFAULT_STEPS/DEFAULT_CFG valves; per-style values are canonical.

Sources: each model's Civitai page (CyberRealisticXL, Juggernaut, Pony V6 XL, TalmendoXL, Reed FurryMix, IndigoVoid FurryFused, NovaFurryXL) and Pony/Illustrious prompting guides.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-19 12:45:34 -05:00
parent cd0034cd99
commit 45d5541be0
1 changed file with 220 additions and 105 deletions
--- a/deployments/ai-stack/openwebui-tools/smart_image_gen.py
+++ b/deployments/ai-stack/openwebui-tools/smart_image_gen.py
@@ -1,10 +1,13 @@
 """
 title: Smart Image Generator (ComfyUI)
 author: ai-stack
-version: 0.1.0
+version: 0.2.0
 description: Generate images via ComfyUI with automatic SDXL checkpoint
    routing. The LLM picks (or auto-detects) the right model — photoreal,
-    anime/score-tag, furry-IL, etc. — based on the user's request.
+    Pony score-tag, NoobAI/Illustrious furry, etc. — based on the user's
+    request. Each style ships with the creator-recommended sampler,
+    scheduler, CFG, steps, CLIP skip, prompt-prefix dialect, and
+    negatives.
 required_open_webui_version: 0.5.0
 """

@@ -19,94 +22,171 @@ import aiohttp
 from pydantic import BaseModel, Field


-# Filename → use case. Edit alongside `comfyui-init-models.sh` so the
-# files actually exist in /opt/comfyui/models/checkpoints/.
-CHECKPOINTS = {
-    "photo":      "CyberRealisticXLPlay_V8.0_FP16.safetensors",
-    "juggernaut": "Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors",
-    "pony":       "ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
-    "general":    "talmendoxlSDXL_v11Beta.safetensors",
-    "furry-nai":  "reedFURRYMixSDXL_v23nai.safetensors",
-    "furry-noob": "indigoVoidFurryFusedXL_noobaiV32.safetensors",
-    "furry-il":   "novaFurryXL_ilV170.safetensors",
+# ─────────────────────────────────────────────────────────────────────────────
+# Per-style settings — sampler/scheduler/cfg/steps/clip_skip/prefix/negatives
+# come from each model's creator page on Civitai. Three prefix dialects in
+# play: photoreal (no prefix, natural language), Pony score chain (REQUIRED
+# for any Pony-derived checkpoint), and Booru quality tags (NoobAI /
+# Illustrious lineage). Never cross-contaminate.
+# ─────────────────────────────────────────────────────────────────────────────
+
+STYLES = {
+    "photo": {
+        "ckpt":      "CyberRealisticXLPlay_V8.0_FP16.safetensors",
+        "sampler":   "dpmpp_2m_sde",
+        "scheduler": "karras",
+        "cfg":       4.0,
+        "steps":     28,
+        "clip_skip": 1,
+        "prefix":    "",  # natural language only — no quality tags
+        "negative": (
+            "cartoon, drawing, illustration, anime, manga, painting, sketch, "
+            "render, 3d, cgi, watercolor, plastic skin, doll-like, oversaturated, "
+            "lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
+            "bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
+            "fused fingers, malformed hands, asymmetric face, "
+            "watermark, signature, text, logo, label, username"
+        ),
+    },
+    "juggernaut": {
+        "ckpt":      "Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors",
+        "sampler":   "dpmpp_2m_sde",
+        "scheduler": "karras",
+        "cfg":       4.5,
+        "steps":     35,
+        "clip_skip": 1,
+        "prefix":    "",  # natural language only
+        "negative": (
+            "cartoon, drawing, illustration, anime, manga, painting, sketch, "
+            "render, 3d, cgi, plastic skin, washed out, oversaturated, "
+            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
+            "bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
+            "fused fingers, malformed hands, "
+            "watermark, signature, text, logo, username"
+        ),
+    },
+    "pony": {
+        "ckpt":      "ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
+        "sampler":   "euler_ancestral",
+        "scheduler": "normal",
+        "cfg":       7.5,
+        "steps":     25,
+        "clip_skip": 2,
+        # REQUIRED — the full chain. Just `score_9` alone is much weaker.
+        "prefix":    "score_9, score_8_up, score_7_up, score_6_up, score_5_up, score_4_up, ",
+        # Pony's creator notes negatives are usually unnecessary; conservative
+        # baseline only. Source-toggle tags (source_pony/furry/anime/cartoon)
+        # are intentionally omitted — they exclude entire content domains.
+        "negative": (
+            "score_6, score_5, score_4, "
+            "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
+            "bad anatomy, bad proportions, bad hands, extra digit, fewer digits, "
+            "fused fingers, malformed limbs, deformed, ugly, "
+            "censored, monochrome, "
+            "watermark, signature, text, logo, artist name, patreon username, twitter username"
+        ),
+    },
+    "general": {
+        "ckpt":      "talmendoxlSDXL_v11Beta.safetensors",
+        "sampler":   "dpmpp_2m",
+        "scheduler": "karras",
+        "cfg":       8.0,  # Talmendo wants notably higher CFG than the others
+        "steps":     30,
+        "clip_skip": 2,
+        "prefix":    "",  # creator says don't push "masterpiece" — fights the amateur aesthetic
+        "negative": (
+            "lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
+            "bad anatomy, deformed, mutated, extra limbs, missing fingers, fused fingers, "
+            "malformed hands, ugly, "
+            "watermark, signature, text, logo"
+        ),
+    },
+    "furry-nai": {
+        "ckpt":      "reedFURRYMixSDXL_v23nai.safetensors",
+        "sampler":   "euler_ancestral",
+        "scheduler": "normal",
+        "cfg":       5.0,
+        "steps":     30,
+        "clip_skip": 2,
+        "prefix": (
+            "masterpiece, best quality, high quality, good quality, "
+            "detailed eyes, highres, absurdres, furry, "
+        ),
+        "negative": (
+            "human, realistic, photorealistic, 3d, cgi, "
+            "worst quality, bad_quality, normal quality, lowres, "
+            "anatomical nonsense, bad anatomy, interlocked fingers, extra fingers, "
+            "bad_feet, bad_hands, deformed anatomy, bad proportions, "
+            "censored, simple background, transparent, face backlighting, "
+            "watermark, signature, text, logo, username, jpeg artifacts"
+        ),
+    },
+    "furry-noob": {
+        "ckpt":      "indigoVoidFurryFusedXL_noobaiV32.safetensors",
+        "sampler":   "euler_ancestral",  # creator: other samplers won't work
+        "scheduler": "normal",
+        "cfg":       4.5,
+        "steps":     20,
+        "clip_skip": 2,
+        "prefix": (
+            "masterpiece, best quality, perfect quality, absurdres, newest, "
+            "very aesthetic, vibrant colors, "
+        ),
+        "negative": (
+            "human, realistic, photorealistic, 3d, cgi, "
+            "shiny skin, shiny clothing, "
+            "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
+            "bad anatomy, bad hands, mutated hands, bad proportions, "
+            "extra digit, fewer digits, fused fingers, malformed limbs, deformed, ugly, "
+            "watermark, signature, text, logo, username, artist signature"
+        ),
+    },
+    "furry-il": {
+        "ckpt":      "novaFurryXL_ilV170.safetensors",
+        "sampler":   "euler_ancestral",
+        "scheduler": "normal",
+        "cfg":       4.0,
+        "steps":     30,
+        "clip_skip": 2,
+        # Illustrious wants `newest` in positive and `old`/`oldest` in negative
+        # — these are year-bucket tags from the training set. `furry` and
+        # `anthro` are universally helpful here.
+        "prefix": (
+            "masterpiece, best quality, amazing quality, very aesthetic, "
+            "high resolution, ultra-detailed, absurdres, newest, furry, anthro, "
+        ),
+        "negative": (
+            "human, multiple tails, modern, recent, old, oldest, "
+            "graphic, cartoon, painting, crayon, graphite, abstract, glitch, "
+            "deformed, mutated, ugly, disfigured, long body, conjoined, "
+            "lowres, bad anatomy, bad hands, missing fingers, extra digits, fewer digits, "
+            "cropped, very displeasing, worst quality, bad quality, sketch, "
+            "jpeg artifacts, signature, watermark, username, text, simple background, "
+            "bad ai-generated"
+        ),
+    },
 }

-# Style-specific negative prompts. Always applied as the baseline; whatever
-# the caller supplies via `negative_prompt` is appended on top.
-#
-# Quality-focused only — no NSFW or content filtering by default. If you
-# want SFW-by-default, add an explicit safe-mode flag rather than baking
-# content terms in here (some checkpoints in this set are commonly used
-# for adult work and would fight the negative).
-NEGATIVES = {
-    "photo": (
-        "cartoon, drawing, illustration, anime, manga, painting, sketch, "
-        "render, 3d, cgi, watercolor, plastic skin, doll-like, oversaturated, "
-        "lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
-        "bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
-        "fused fingers, malformed hands, asymmetric face, "
-        "watermark, signature, text, logo, username"
-    ),
-    "juggernaut": (
-        "cartoon, drawing, illustration, anime, manga, painting, sketch, "
-        "render, 3d, cgi, plastic skin, washed out, oversaturated, "
-        "lowres, blurry, jpeg artifacts, low quality, worst quality, "
-        "bad anatomy, deformed, mutated, extra limbs, extra fingers, missing fingers, "
-        "fused fingers, malformed hands, "
-        "watermark, signature, text, logo, username"
-    ),
-    "pony": (
-        "score_6, score_5, score_4, "
-        "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
-        "bad anatomy, bad proportions, bad hands, extra digit, fewer digits, "
-        "fused fingers, malformed limbs, deformed, ugly, "
-        "censored, monochrome, "
-        "watermark, signature, text, logo, artist name, patreon username, twitter username"
-    ),
-    "general": (
-        "lowres, blurry, jpeg artifacts, noisy, grainy, low quality, worst quality, "
-        "bad anatomy, deformed, mutated, extra limbs, missing fingers, fused fingers, "
-        "malformed hands, ugly, "
-        "watermark, signature, text, logo"
-    ),
-    "furry-nai": (
-        "human, realistic, photorealistic, 3d, cgi, "
-        "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
-        "bad anatomy, bad proportions, extra digit, fewer digits, fused fingers, "
-        "malformed limbs, deformed, ugly, "
-        "watermark, signature, text, logo, artist signature, patreon username"
-    ),
-    "furry-noob": (
-        "human, realistic, photorealistic, 3d, cgi, "
-        "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
-        "bad anatomy, bad proportions, extra digit, fewer digits, fused fingers, "
-        "malformed limbs, deformed, ugly, "
-        "watermark, signature, text, logo, artist signature, patreon username"
-    ),
-    "furry-il": (
-        "human, realistic, photorealistic, 3d, cgi, "
-        "worst quality, low quality, lowres, blurry, jpeg artifacts, noisy, "
-        "bad anatomy, bad proportions, extra digit, fewer digits, fused fingers, "
-        "malformed limbs, deformed, ugly, "
-        "watermark, signature, text, logo, artist signature, patreon username"
-    ),
-}
+DEFAULT_STYLE = "general"

 # First-match-wins keyword router used when the caller didn't pick a style.
 # Order matters — narrower patterns above broader ones.
 ROUTING_RULES = [
+    # Pony score chain is the single strongest signal — Pony only
    (re.compile(r"\bscore_\d", re.I),                                       "pony"),
    (re.compile(r"\bpony\b",   re.I),                                       "pony"),
+    # NoobAI / Illustrious explicit mentions
    (re.compile(r"\b(noobai|noob)\b", re.I),                                "furry-noob"),
    (re.compile(r"\b(illustrious|ilxl)\b", re.I),                           "furry-il"),
-    (re.compile(r"\b(furry|anthro|feral|kemono|fursona|species)\b", re.I),  "furry-nai"),
+    # Generic furry — defaults to NovaFurry (Illustrious lineage, current sweet spot)
+    (re.compile(r"\b(furry|anthro|feral|kemono|fursona|species)\b", re.I),  "furry-il"),
+    # Photo / photoreal
    (re.compile(r"\b(juggernaut)\b", re.I),                                 "juggernaut"),
    (re.compile(r"\b(photo|photograph|realistic|portrait|selfie|cinematic)\b", re.I), "photo"),
+    # Generic anime / illustration → Pony covers anime well
    (re.compile(r"\b(anime|manga|2d|illustration)\b", re.I),                "pony"),
 ]

-DEFAULT_STYLE = "general"
-

 def _route_style(prompt: str) -> str:
    for pattern, style in ROUTING_RULES:
@@ -115,24 +195,41 @@ def _route_style(prompt: str) -> str:
    return DEFAULT_STYLE


-def _build_workflow(prompt, negative, ckpt, width, height, steps, cfg, seed):
+def _build_workflow(positive: str, negative: str, settings: dict,
+                    width: int, height: int, seed: int) -> dict:
+    """
+    Construct an SDXL txt2img workflow. CLIP skip is implemented via a
+    CLIPSetLastLayer node so the same graph works for skip 1 (-1) and
+    skip 2 (-2).
+    """
    return {
        "3": {"class_type": "KSampler", "inputs": {
            "seed": seed if seed > 0 else int(time.time() * 1000) % (2**31),
-            "steps": steps, "cfg": cfg,
-            "sampler_name": "dpmpp_2m", "scheduler": "karras",
+            "steps": settings["steps"],
+            "cfg": settings["cfg"],
+            "sampler_name": settings["sampler"],
+            "scheduler": settings["scheduler"],
            "denoise": 1.0,
-            "model": ["4", 0], "positive": ["6", 0],
-            "negative": ["7", 0], "latent_image": ["5", 0],
+            "model": ["4", 0],
+            "positive": ["6", 0],
+            "negative": ["7", 0],
+            "latent_image": ["5", 0],
        }},
-        "4": {"class_type": "CheckpointLoaderSimple", "inputs": {"ckpt_name": ckpt}},
+        "4": {"class_type": "CheckpointLoaderSimple",
+              "inputs": {"ckpt_name": settings["ckpt"]}},
        "5": {"class_type": "EmptyLatentImage",
              "inputs": {"width": width, "height": height, "batch_size": 1}},
-        "6": {"class_type": "CLIPTextEncode", "inputs": {"text": prompt,   "clip": ["4", 1]}},
-        "7": {"class_type": "CLIPTextEncode", "inputs": {"text": negative, "clip": ["4", 1]}},
-        "8": {"class_type": "VAEDecode", "inputs": {"samples": ["3", 0], "vae": ["4", 2]}},
+        "6": {"class_type": "CLIPTextEncode",
+              "inputs": {"text": positive, "clip": ["10", 0]}},
+        "7": {"class_type": "CLIPTextEncode",
+              "inputs": {"text": negative, "clip": ["10", 0]}},
+        "8": {"class_type": "VAEDecode",
+              "inputs": {"samples": ["3", 0], "vae": ["4", 2]}},
        "9": {"class_type": "SaveImage",
              "inputs": {"filename_prefix": "smartgen", "images": ["8", 0]}},
+        "10": {"class_type": "CLIPSetLastLayer",
+               "inputs": {"stop_at_clip_layer": -settings["clip_skip"],
+                          "clip": ["4", 1]}},
    }


@@ -142,10 +239,8 @@ class Tools:
            default="http://comfyui:8188",
            description="ComfyUI server URL reachable from the open-webui container.",
        )
-        DEFAULT_STEPS: int = Field(default=25, description="KSampler steps.")
-        DEFAULT_CFG: float = Field(default=7.0, description="CFG scale.")
        TIMEOUT_SECONDS: int = Field(
-            default=180,
+            default=240,
            description="Maximum wait for a single generation to complete.",
        )

@@ -163,33 +258,51 @@ class Tools:
        __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
    ) -> str:
        """
-        Generate an image with the right SDXL checkpoint for the request.
+        Generate an image with the right SDXL checkpoint and creator-
+        recommended sampler/CFG/steps/prompt-format for the request.

        Pick `style` based on what the user wants:
        - "photo": photorealistic photographs, portraits, cinematic shots.
+            Uses CyberRealisticXL — natural-language prompts, no quality tags.
        - "juggernaut": versatile photoreal alternative — sharper, more saturated.
-        - "pony": anime / illustration with score tags (score_9, score_8_up, ...).
-        - "general": catch-all SDXL when no specific style applies.
+            Uses Juggernaut-XL — natural-language prompts, no quality tags.
+        - "pony": anime / illustration with Pony's score-tag prompt format.
+            Uses Pony Diffusion V6 XL — score_9..score_4_up chain auto-prepended.
+            Best for anime, cartoon, and stylised art.
+        - "general": amateur-photo aesthetic, catch-all SDXL.
+            Uses TalmendoXL — natural-language prompts, higher CFG.
        - "furry-nai": anthropomorphic characters, NAI-trained mix.
+            Uses reedFURRYMix — booru quality tags auto-prepended.
        - "furry-noob": anthropomorphic characters, NoobAI base.
+            Uses IndigoVoid FurryFused — booru quality tags auto-prepended.
        - "furry-il": anthropomorphic characters, Illustrious base.
+            Uses NovaFurryXL — booru quality + year tags auto-prepended.
+            Default for unspecified furry/anthro requests.

        If `style` is omitted, the tool auto-detects from `prompt` keywords.
+        Each style has its own creator-recommended sampler, CFG, steps, and
+        CLIP skip — you don't need to override any of these.

-        :param prompt: The image description.
+        :param prompt: The image description. Style-appropriate quality
+            tags (Pony score chain, Booru masterpiece chain, etc.) are
+            prepended automatically — don't include them in `prompt`.
        :param style: One of the keys above. Omit to auto-route.
-        :param negative_prompt: Extra negatives appended to the style default.
-        :param width: Output width in pixels (default 1024, SDXL native).
-        :param height: Output height in pixels (default 1024, SDXL native).
+        :param negative_prompt: Extra negatives appended to the per-style
+            baseline. Usually unneeded — each style ships with a tuned
+            negative.
+        :param width: Output width in pixels (default 1024 — SDXL native).
+            Use 832 for portraits with height 1216, or 1216 with height 832
+            for landscapes.
+        :param height: Output height in pixels (default 1024).
        :param seed: Specific seed, or 0 to randomize.
        :return: Markdown embedding the generated image.
        """
        chosen = style or _route_style(prompt)
-        ckpt = CHECKPOINTS.get(chosen)
-        if not ckpt:
+        settings = STYLES.get(chosen)
+        if not settings:
            return (
                f"Unknown style '{chosen}'. "
-                f"Available: {', '.join(CHECKPOINTS.keys())}"
+                f"Available: {', '.join(STYLES.keys())}"
            )

        async def emit(msg: str, done: bool = False):
@@ -199,20 +312,19 @@ class Tools:
                    "data": {"description": msg, "done": done},
                })

-        await emit(f"Routing to {chosen} ({ckpt})")
+        await emit(f"Routing to {chosen} ({settings['ckpt']})")

-        negative = NEGATIVES.get(chosen, "")
+        positive = f"{settings['prefix']}{prompt}"
+        negative = settings["negative"]
        if negative_prompt:
-            negative = f"{negative}, {negative_prompt}" if negative else negative_prompt
+            negative = f"{negative}, {negative_prompt}"

        workflow = _build_workflow(
-            prompt=prompt,
+            positive=positive,
            negative=negative,
-            ckpt=ckpt,
+            settings=settings,
            width=width,
            height=height,
-            steps=self.valves.DEFAULT_STEPS,
-            cfg=self.valves.DEFAULT_CFG,
            seed=seed,
        )

@@ -231,7 +343,10 @@ class Tools:
                if not prompt_id:
                    return f"ComfyUI didn't return a prompt_id: {submit}"

-            await emit("Queued, sampling…")
+            await emit(
+                f"Sampling — {settings['sampler']}/{settings['scheduler']}, "
+                f"CFG {settings['cfg']}, {settings['steps']} steps"
+            )

            deadline = time.time() + self.valves.TIMEOUT_SECONDS
            output_images = []