Compare commits
42 Commits
| SHA1 |
|---|
| c07e962cae |
| d034700af9 |
| 28f370a80b |
| 02a4bece5d |
| a1af88a632 |
| d8c8421361 |
| f5a222fe6f |
| ec6108888a |
| 20d4bd5b72 |
| 1ae451ad5f |
| 011fade024 |
| 63917709c1 |
| 1ed2e7293e |
| 18a205d69d |
| 0fa8040251 |
| e77666ea0f |
| 6700f6ce33 |
| def27087c1 |
| 2cecf77981 |
| f26dfbee02 |
| 06433d3815 |
| 780ce42711 |
| f6f5690fcd |
| d935e24624 |
| 7c7897818e |
| a1fca4d5d9 |
| 0b1c2ee5b5 |
| 5a34ced8f1 |
| f77f5993fb |
| b604e3f509 |
| 4d996e1205 |
| 6adf133558 |
| d4e2058859 |
| 41d571d8d1 |
| 9e22de0328 |
| b815cd6a5f |
| 45d5541be0 |
| cd0034cd99 |
| c585e53ed4 |
| 392b26167f |
| 704bcfdf13 |
| 0ad99b6199 |
**Dockerfile** (22 changed lines)

```diff
@@ -52,8 +52,30 @@ RUN git clone --depth 1 https://github.com/ltdrdata/ComfyUI-Manager.git \
     ${COMFYUI_HOME}/custom_nodes/ComfyUI-Manager && \
     pip install -r ${COMFYUI_HOME}/custom_nodes/ComfyUI-Manager/requirements.txt
 
+# comfyui_segment_anything — GroundingDINO + SAM-HQ in one bundle. Required
+# by the smart_image_gen Tool's text-targeted inpainting (edit_image with the
+# mask_text parameter). Model weights auto-download on first use into
+# /opt/comfyui/models/{sams,grounding-dino}/ — first inpaint takes ~3 GB of
+# downloads, subsequent runs are instant.
+#
+# Transformers must stay <5: GroundingDINO inside this node calls
+# BertModel.get_head_mask, which transformers 5.0 silently removed. The pin
+# is applied AFTER the requirements install so it overrides anything the
+# upstream requirements.txt would have pulled.
+RUN git clone --depth 1 https://github.com/storyicon/comfyui_segment_anything.git \
+    ${COMFYUI_HOME}/custom_nodes/comfyui_segment_anything && \
+    pip install -q -r ${COMFYUI_HOME}/custom_nodes/comfyui_segment_anything/requirements.txt && \
+    pip install -q "transformers>=4.40,<5"
+
+# Entrypoint wrapper — auto-installs requirements.txt for any custom_node
+# present at startup (covers Manager-installed nodes and nodes cloned
+# directly into the comfyui-custom-nodes volume).
+COPY install-custom-node-deps.sh /usr/local/bin/install-custom-node-deps.sh
+RUN chmod +x /usr/local/bin/install-custom-node-deps.sh
 
 EXPOSE 8188
 
 # --listen 0.0.0.0 binds to every interface so the Open WebUI container on the
 # shared compose network can reach it. --port is explicit for clarity.
+ENTRYPOINT ["/usr/local/bin/install-custom-node-deps.sh"]
 CMD ["python", "main.py", "--listen", "0.0.0.0", "--port", "8188"]
```
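After rebuilding, it is worth confirming the transformers pin actually won over the node's own requirements install. A minimal smoke test, assuming the stack is up and the service is named `comfyui` as in docker-compose.yml:

```sh
# Should print a 4.x version; a 5.x here means the pin was overridden and
# GroundingDINO will fail on BertModel.get_head_mask.
docker compose exec comfyui python -c 'import transformers; print(transformers.__version__)'
```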
**deployments/ai-stack/.env.example**

```diff
@@ -19,6 +19,36 @@ WEBUI_SECRET_KEY=replace-with-32-byte-hex
 # Only needed if you uncomment the anubis-owui service in docker-compose.yml.
 ANUBIS_OWUI_KEY=replace-with-32-byte-hex
 
-# ComfyUI image tag to deploy. `latest` tracks whatever the release workflow
-# last pushed; pin to a v* tag (e.g. 0.1.0) for reproducible deploys.
-COMFYUI_IMAGE_TAG=latest
+# ─── Image tags ─────────────────────────────────────────────────────────────
+# Pin to specific versions for reproducible deploys. The defaults below are
+# the last set verified to work end-to-end for this stack — change only when
+# you've tested a newer combination. `latest` / `main` is fine for local
+# experimentation but means deploys are non-deterministic.
+#
+# Find current tags at:
+#   ComfyUI     git.anomalous.dev/alphacentri/comfyui-nvidia/-/tags
+#   Caddy       https://hub.docker.com/_/caddy/tags
+#   Ollama      https://hub.docker.com/r/ollama/ollama/tags
+#   Open WebUI  https://github.com/open-webui/open-webui/pkgs/container/open-webui
+#   Alpine      https://hub.docker.com/_/alpine/tags
+#   Anubis      https://github.com/TecharoHQ/anubis/pkgs/container/anubis
+
+COMFYUI_IMAGE_TAG=0.2.1
+CADDY_TAG=2-alpine
+OLLAMA_TAG=latest
+OPEN_WEBUI_TAG=main
+ALPINE_TAG=3.20
+ANUBIS_TAG=latest
+
+# HuggingFace access token. Only needed if comfyui-init-models.sh references
+# gated repos (Flux-dev, SD3, etc.). Generate a read token at
+# https://huggingface.co/settings/tokens. Leave empty for public-only.
+HF_TOKEN=
+
+# HTTPS base URL of an S3 bucket / CDN that hosts mirrored Ollama model
+# tarballs (created by mirror-ollama-model.sh). Files under this base are
+# fetched by init-models.sh's s3_pull instead of registry.ollama.ai —
+# faster and immune to upstream rate-limiting / removal. Example:
+#   S3_OLLAMA_BASE=https://your-bucket.s3.amazonaws.com/ollama-models
+# Leave empty to fall back to plain `ollama pull` for everything.
+S3_OLLAMA_BASE=
```
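Both `WEBUI_SECRET_KEY` and `ANUBIS_OWUI_KEY` want 32 bytes of hex; a quick way to generate them on any host with openssl:

```sh
# 32 random bytes, hex-encoded. Run once per secret.
openssl rand -hex 32
```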
**deployments/ai-stack/README.md**

```diff
@@ -10,12 +10,17 @@ production `srvno.de` deployment.
 
 ## Files
 
-| File                | Purpose                                                  |
-| ------------------- | -------------------------------------------------------- |
-| `docker-compose.yml`| Service definitions, volumes, GPU reservations           |
-| `Caddyfile`         | TLS + reverse proxy config (one site block per hostname) |
-| `init-models.sh`    | Models to preseed into Ollama on first boot              |
-| `.env.example`      | Secrets and image-tag pins. Copy to `.env`               |
+| File                                 | Purpose                                                   |
+| ------------------------------------ | --------------------------------------------------------- |
+| `docker-compose.yml`                 | Service definitions, volumes, GPU reservations            |
+| `Caddyfile`                          | TLS + reverse proxy config (one site block per hostname)  |
+| `init-models.sh`                     | LLMs to preseed into Ollama on first boot                 |
+| `mirror-ollama-model.sh`             | Helper — mirror an Ollama model into a tarball you can host on S3 |
+| `comfyui-init-models.sh`             | Checkpoints/VAEs/LoRAs to preseed into ComfyUI on first boot |
+| `openwebui-tools/smart_image_gen.py` | Tool that auto-routes image generation, img2img, and text-targeted inpainting to the right SDXL checkpoint |
+| `openwebui-models/image_studio.md`   | Dedicated chat-model preset — manual setup walkthrough    |
+| `openwebui-models/image_studio.json` | The same preset as an importable Open WebUI model JSON    |
+| `.env.example`                       | Secrets and image-tag pins. Copy to `.env`                |
 
 ## 1. Host prerequisites
```
````diff
@@ -60,7 +65,20 @@ Then edit:
 ```
 
 - **`init-models.sh`** — keep the LLMs you want preseeded, drop the rest.
   Check sizes at <https://ollama.com/library> first; the host needs disk
-  for everything listed.
+  for everything listed. Two pull paths are available:
+  - `pull "<model:tag>"` — standard registry pull from
+    `registry.ollama.ai`.
+  - `s3_pull "<model:tag>" "<archive.tgz>"` — fetches from your own
+    mirror set via `S3_OLLAMA_BASE` in `.env`. Falls back to
+    `ollama pull` if the env var isn't set, so this is safe to enable
+    incrementally. Create the tarballs once with
+    `mirror-ollama-model.sh` (see [Mirroring models to S3](#mirroring-models-to-s3)).
+- **`comfyui-init-models.sh`** — checkpoints/VAEs/LoRAs to preseed into
+  ComfyUI. Ships empty (no active fetches) — uncomment the SDXL/Flux/
+  upscaler examples or add your own. Whatever filename you pick should
+  match the `ckpt_name` field in `workflows/*.json` (default expects
+  `CyberRealisticXLPlay_V8.0_FP16.safetensors`). Set `HF_TOKEN` in
+  `.env` if any are gated repos.
 
 ## 3. Bring it up
````
````diff
@@ -80,22 +98,29 @@ docker compose exec comfyui curl -sf http://127.0.0.1:8188/system_stats | head -
 docker compose exec open-webui curl -sf http://127.0.0.1:8080/health
 ```
 
-## 4. Drop in at least one ComfyUI checkpoint
+## 4. ComfyUI checkpoints
 
-ComfyUI ships no models. The shipped workflow templates reference
-`v1-5-pruned-emaonly.safetensors` as a placeholder; drop any
-SD/SDXL/Flux checkpoint into the `comfyui-models` volume under
-`checkpoints/`:
-
-```sh
-docker run --rm -v ai-stack_comfyui-models:/models -w /models/checkpoints \
-  curlimages/curl:latest -L -O \
-  https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.safetensors
-```
-
-Or open the ComfyUI native UI at `https://comfyui.example.com` (after
-basic-auth login), use the **Manager** button (added by ComfyUI-Manager),
-and install one through **Model Manager**.
+ComfyUI ships no models. Three ways to get one in:
+
+1. **Preseed via the sidecar (default).** `comfyui-model-init` runs once
+   on `compose up`, downloads everything `comfyui-init-models.sh` lists,
+   and exits. The script ships empty — uncomment one of the examples or
+   add your own `fetch` calls (SDXL, Flux, LoRAs, upscalers, etc.). At
+   least one checkpoint should be named
+   `CyberRealisticXLPlay_V8.0_FP16.safetensors` to match the workflow
+   default, or update `ckpt_name` in `workflows/*.json` to whatever you
+   pull. Re-run with `docker compose up -d comfyui-model-init` after
+   script edits; already-present files are skipped.
+2. **ComfyUI-Manager UI.** Open `https://comfyui.example.com` (after
+   basic-auth login), click **Manager**, then **Model Manager**, install
+   from the catalogue.
+3. **Direct copy into the volume.** Useful if you already have the file
+   locally:
+
+   ```sh
+   docker run --rm -v ai-stack_comfyui-models:/models -v $PWD:/src alpine \
+     cp /src/your-model.safetensors /models/checkpoints/
+   ```
 
 ## 5. First-user signup in Open WebUI
````
```diff
@@ -117,7 +142,7 @@ In Open WebUI: **Admin Panel -> Settings -> Images**.
 4. **ComfyUI Workflow Nodes** -> paste the contents of
    [`../../workflows/txt2img.nodes.json`](../../workflows/txt2img.nodes.json).
 5. **Default Model** -> the filename of the checkpoint you dropped in
-   step 4 (e.g. `v1-5-pruned-emaonly.safetensors`).
+   step 4 (e.g. `CyberRealisticXLPlay_V8.0_FP16.safetensors`).
 6. Save.
 
 For image editing (img2img), scroll to the **Image Editing** section in
```
`@@ -132,6 +157,140 @@` (pure addition — the two sections and the mirroring guide below are new)

Open WebUI submits the workflow to ComfyUI; the result drops back into
the chat when KSampler finishes. To test img2img, attach an image and
use the edit action.

## 8. (Optional) Install the smart-routing Tool

The image-button path always uses the admin's **Default Model**. To get
per-prompt checkpoint routing — e.g. "draw me a cyberpunk city" picks
CyberRealistic, "anthro fox warrior" picks one of the furry checkpoints —
install the `smart_image_gen.py` Tool. It exposes two methods the LLM
calls:

- **`generate_image`** for new images from scratch (txt2img).
- **`edit_image`** for modifying an image the user attached to the
  chat. Two modes:
  - With `mask_text` — text-targeted inpainting via GroundingDINO+SAM
    (e.g. "the dog's collar"). Only the named region is repainted.
  - Without `mask_text` — full img2img, which reimagines the whole
    image at the requested denoise.

Both auto-route to the right SDXL checkpoint per request.

> **First inpaint takes a few minutes**: SAM-HQ (~2.5 GB) and
> GroundingDINO (~700 MB) auto-download into the `comfyui-models`
> volume on the very first call to `edit_image` with `mask_text`.
> Subsequent inpaints are instant.

1. **Workspace -> Tools -> +** (top-right).
2. Paste the contents of
   [`openwebui-tools/smart_image_gen.py`](openwebui-tools/smart_image_gen.py).
3. Save. Optionally adjust the Valves (ComfyUI URL, default steps, CFG,
   timeout) via the gear icon.
4. **Workspace -> Models** (or pick an existing chat model) -> edit ->
   under **Tools**, enable `smart_image_gen` -> save.
5. Make sure the model has **native function calling** enabled
   (Workspace -> Models -> the model -> Advanced Params -> Function
   Calling: Native). Mistral, Qwen, and Llama 3.1+ all support this.

In a chat with that model, ask for an image — "make me a photoreal
portrait of a cyberpunk samurai" — and the LLM should call
`generate_image(prompt=..., style="photo")`. The status bar shows
"Routing to photo (CyberRealisticXLPlay…)" while it generates.

If the LLM responds in text instead of calling the tool, install the
**Image Studio** chat-model preset (next section) — a dedicated model
with a system prompt that removes the ambiguity.

## 9. (Recommended) Install the Image Studio model preset

General-purpose chat models often "describe" an image in text instead
of firing the `generate_image` tool, especially on conversational
phrasing ("can you draw me…", "I'd love a picture of…"). The
**Image Studio** preset wraps `huihui_ai/qwen3-vl-abliterated:8b` in a
system prompt that mandates tool use — every message is treated as an
image request.

Setup — two paths:

- **Import the JSON** (fast): Workspace → Models → Import →
  [`openwebui-models/image_studio.json`](openwebui-models/image_studio.json).
- **Manual** (full control): walkthrough in
  [`openwebui-models/image_studio.md`](openwebui-models/image_studio.md).

Users then pick **Image Studio** from the chat-model dropdown when
they want to generate or edit images.

**One required follow-up** after either install path: set a separate
**Task Model** in Admin Settings → Interface → Task Model. Image
Studio uses `tool_choice: required` to force tool calls, which means
the same model can't produce the text responses Open WebUI needs for
chat-title generation, tag suggestions, and autocomplete. Pick any
non-Image-Studio model you have pulled (`mistral-nemo:12b`,
`llama3.1:8b`, etc.) — see the
[**Set a separate Task Model** section in image_studio.md](openwebui-models/image_studio.md#set-a-separate-task-model-required-after-install).

The preset ships with `vision: true` so users can attach images for
editing, and that stays useful even if you swap in a text-only base
such as `mistral-nemo:12b` — see the
[**Vision capability** section in image_studio.md](openwebui-models/image_studio.md#vision-capability)
for the trade-offs and the alternative vision LLMs
(`qwen2.5vl:7b`, `llama3.2-vision:11b`, etc.) if the LLM needs to
actually see the image to write smarter edit instructions.

To extend (new checkpoint, new style):

- Add the filename to `comfyui-init-models.sh` so it gets pulled.
- Add a key to the `CHECKPOINTS` dict in `smart_image_gen.py`.
- Optionally add style-specific negatives to `NEGATIVES`.
- Optionally add keyword routing rules to `ROUTING_RULES` for the
  auto-detect path.
- Re-paste the Tool source in Workspace -> Tools.

## Mirroring models to S3

For models you want to pin against upstream changes (or pull faster
from your own infra), mirror them to S3 once and have the
deployment fetch from there.

### Create the mirror tarball

Run [`mirror-ollama-model.sh`](mirror-ollama-model.sh) on any machine
that has the model pulled locally. It reads `~/.ollama/models/`,
finds the manifest's referenced blobs, and tars everything together:

```sh
./mirror-ollama-model.sh huihui_ai/qwen3.5-abliterated:9b qwen3.5-abliterated-9b.tgz
```

### Upload to S3

Whatever fits — `aws s3 cp`, `mc`, `rclone`, etc. The bucket needs
to expose the file over HTTPS (public-read ACL on the object, a
CloudFront distribution, R2 with public URLs, etc.):

```sh
aws s3 cp qwen3.5-abliterated-9b.tgz s3://your-bucket/ollama-models/ --acl public-read
```

### Wire the deployment to fetch from there

In `.env`:

```
S3_OLLAMA_BASE=https://your-bucket.s3.amazonaws.com/ollama-models
```

In `init-models.sh`, switch the affected models from `pull` to
`s3_pull`:

```sh
s3_pull "huihui_ai/qwen3.5-abliterated:9b" "qwen3.5-abliterated-9b.tgz"
```

`docker compose up -d model-init` re-runs the init container; the
script downloads the tarball, extracts into the `ollama-data` volume,
and the running Ollama daemon picks it up on its next manifest scan.

If `S3_OLLAMA_BASE` isn't set, `s3_pull` transparently falls back to
`ollama pull` — safe to commit `s3_pull` lines without S3 ready yet.
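Before flipping a model over to `s3_pull`, it is cheap to confirm the object is reachable at the URL the script will build (`${S3_OLLAMA_BASE%/}/<archive>`). A quick check, using the example names above:

```sh
# Expect "HTTP/1.1 200 OK" (or HTTP/2 200); anything else means s3_pull
# will fail and fall through to a curl error, not to ollama pull.
curl -sfI "https://your-bucket.s3.amazonaws.com/ollama-models/qwen3.5-abliterated-9b.tgz" | head -1
```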
## Enabling Anubis (later)

The `anubis-owui` service is defined in compose but no Caddy site block
```diff
@@ -154,11 +313,31 @@ provides a prompt, image, seed, etc. Each entry:
 
 Recognised `type` strings (per Open WebUI source): `model`, `prompt`,
 `negative_prompt`, `width`, `height`, `n` (batch size), `steps`, `seed`,
-and `image` (img2img / edit only).
+and `image` (img2img / edit only). Notably **not** mappable: sampler,
+scheduler, CFG, CLIP skip, prompt prefix.
```
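For orientation, a hypothetical mapping excerpt. The `type` / `key` / `node_ids` field names follow the shape Open WebUI's ComfyUI integration uses for these files; the node IDs here are invented for illustration and must match your own workflow JSON:

```sh
cat ../../workflows/txt2img.nodes.json
# [
#   { "type": "model",  "key": "ckpt_name", "node_ids": ["4"] },
#   { "type": "prompt", "key": "text",      "node_ids": ["6"] },
#   { "type": "seed",   "key": "seed",      "node_ids": ["3"] }
# ]
```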
```diff
 
-If you swap in a fancier workflow (SDXL, Flux, ControlNet, custom
-samplers, NL masking via SAM nodes, etc.), update the matching
-`*.nodes.json` so the node IDs and input keys still line up.
+This means the static workflow JSONs are tuned for a single checkpoint
+family at a time. The shipped defaults match
+`CyberRealisticXLPlay_V8.0_FP16.safetensors`
+(`dpmpp_2m_sde` / `karras` / CFG 4 / 28 steps / CLIP skip 1 / no prefix).
+**If you change the admin's Default Model to a different checkpoint
+family** (Pony, NoobAI, Illustrious, etc.), edit the workflow JSONs:
+
+- `KSampler` node: change `sampler_name`, `scheduler`, `cfg`, `steps`
+- For checkpoints needing CLIP skip 2: add a `CLIPSetLastLayer` node and
+  rewire `CLIPTextEncode` nodes through it (see
+  [openwebui-tools/smart_image_gen.py](openwebui-tools/smart_image_gen.py)
+  for the exact graph).
+- For Pony or NoobAI/Illustrious: the required quality-tag prefix
+  (`score_9, score_8_up, ...` or `masterpiece, best quality, ...`) has
+  to be typed by the user every time, since the workflow can't inject
+  it. **For multi-checkpoint deployments, use the smart_image_gen Tool
+  instead** — it handles per-checkpoint sampler / CFG / steps / CLIP
+  skip / prefix automatically based on the LLM's `style` choice.
+
+If you swap in a fancier workflow (Flux, ControlNet, NL masking via
+SAM nodes, etc.), update the matching `*.nodes.json` so the node IDs
+and input keys still line up.
```
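As a sketch of the CLIP skip 2 rewiring mentioned in the list above, in ComfyUI's API-format workflow JSON. The node IDs are placeholders; `"4"` is assumed to be the `CheckpointLoaderSimple` node, and `stop_at_clip_layer: -2` corresponds to CLIP skip 2:

```sh
# Hypothetical workflow fragment — each CLIPTextEncode takes its clip
# input from the new CLIPSetLastLayer node instead of the loader:
# "10": { "class_type": "CLIPSetLastLayer",
#         "inputs": { "stop_at_clip_layer": -2, "clip": ["4", 1] } },
# "6":  { "class_type": "CLIPTextEncode",
#         "inputs": { "text": "a prompt", "clip": ["10", 0] } }
```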
```diff
 
 ## Common gotchas
```
**deployments/ai-stack/comfyui-init-models.sh** (new file, 84 lines)

```sh
#!/bin/sh
# Preseed ComfyUI's models volume with checkpoints, VAEs, LoRAs, etc.
# Runs once via the comfyui-model-init service (see docker-compose.yml).
# Safe to re-run — already-present files are skipped.
#
# ComfyUI doesn't have a "pull" command of its own, so this is plain curl
# against direct download URLs. For HuggingFace, the direct URL is:
#   https://huggingface.co/<repo>/resolve/main/<file>
# For gated HF repos (Flux-dev, SD3, etc.), set HF_TOKEN in .env — the
# script attaches it as a bearer token automatically.

set -e

apk add --no-cache curl >/dev/null

mkdir -p /models/checkpoints /models/vae /models/loras /models/controlnet \
  /models/clip /models/clip_vision /models/upscale_models /models/embeddings \
  /models/sams /models/grounding-dino

fetch() {
  dest="$1"; name="$2"; url="$3"
  target="/models/$dest/$name"

  if [ -f "$target" ]; then
    echo "✓ $dest/$name already present"
    return
  fi

  echo "→ Downloading $dest/$name…"
  mkdir -p "/models/$dest"

  if [ -n "$HF_TOKEN" ] && echo "$url" | grep -q huggingface.co; then
    curl -fL -C - --retry 3 -H "Authorization: Bearer $HF_TOKEN" \
      -o "$target.partial" "$url"
  else
    curl -fL -C - --retry 3 -o "$target.partial" "$url"
  fi
  mv "$target.partial" "$target"
}

# ─── Edit the list below to choose what gets preseeded ──────────────────────
# Format: fetch <subdir under /models> <filename to save as> <direct URL>
#
# No checkpoints are downloaded by default — the deployment ships expecting
# you to point at your own model mirror or the public examples below.
# Whatever filename you pick should match the `ckpt_name` field in
# workflows/txt2img.json and workflows/img2img.json (the shipped default
# is CyberRealisticXLPlay_V8.0_FP16.safetensors); update either the
# script or the workflows so they line up.

# Examples — uncomment what you want.

# SDXL Base 1.0 (~6.9 GB)
# fetch checkpoints sd_xl_base_1.0.safetensors \
#   https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0/resolve/main/sd_xl_base_1.0.safetensors

# SDXL VAE (fixes washed-out colours on some SDXL checkpoints)
# fetch vae sdxl_vae.safetensors \
#   https://huggingface.co/stabilityai/sdxl-vae/resolve/main/sdxl_vae.safetensors

# Flux.1-dev (~23 GB, gated — needs HF_TOKEN with access to black-forest-labs)
# fetch checkpoints flux1-dev.safetensors \
#   https://huggingface.co/black-forest-labs/FLUX.1-dev/resolve/main/flux1-dev.safetensors

# 4x-UltraSharp upscaler
# fetch upscale_models 4x-UltraSharp.pth \
#   https://huggingface.co/lokCX/4x-Ultrasharp/resolve/main/4x-UltraSharp.pth

# ─── Inpainting models (SAM-HQ + GroundingDINO) ─────────────────────────────
# Required by the smart_image_gen Tool's edit_image with mask_text. ComfyUI
# would auto-download these on first use, but that takes minutes and tends
# to time out in-flight tool calls — preseeding here makes the first inpaint
# instant.

fetch sams sam_hq_vit_h.pth \
  https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_h.pth

fetch grounding-dino groundingdino_swint_ogc.pth \
  https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/groundingdino_swint_ogc.pth

fetch grounding-dino GroundingDINO_SwinT_OGC.cfg.py \
  https://huggingface.co/ShilongLiu/GroundingDINO/resolve/main/GroundingDINO_SwinT_OGC.cfg.py

echo "Done."
```
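After editing the list, re-run just the sidecar and check the result; the fetch log makes skipped vs. downloaded files obvious:

```sh
docker compose up comfyui-model-init
# Confirm ComfyUI sees the files (the path assumes the image's default
# COMFYUI_HOME of /opt/comfyui — adjust if your layout differs):
docker compose exec comfyui ls /opt/comfyui/models/checkpoints/
```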
**deployments/ai-stack/docker-compose.yml**

```diff
@@ -24,7 +24,7 @@ services:
   # Encrypt), reverse-proxies to the in-compose services by name.
   # ---------------------------------------------------------------------------
   caddy:
-    image: caddy:2-alpine
+    image: caddy:${CADDY_TAG:-2-alpine}
     container_name: caddy
     restart: unless-stopped
     ports:
@@ -49,7 +49,7 @@ services:
   # Ollama — LLM daemon, GPU-backed.
   # ---------------------------------------------------------------------------
   ollama:
-    image: ollama/ollama:latest
+    image: ollama/ollama:${OLLAMA_TAG:-latest}
     container_name: ollama
     restart: unless-stopped
     # 11434 only published if you want direct access from the VM host.
@@ -60,8 +60,12 @@ services:
       - ollama-data:/root/.ollama
     environment:
       - OLLAMA_HOST=0.0.0.0:11434
-      - OLLAMA_KEEP_ALIVE=30m
-      - OLLAMA_MAX_LOADED_MODELS=2
+      # KEEP_ALIVE=-1 holds loaded models in VRAM until evicted by another
+      # load (vs the default 5m / our previous 30m which forces a reload
+      # penalty on every cold use). Pair with MAX_LOADED_MODELS sized to
+      # whatever fits in your GPU's VRAM — see README "VRAM sizing".
+      - OLLAMA_KEEP_ALIVE=-1
+      - OLLAMA_MAX_LOADED_MODELS=3
       - OLLAMA_FLASH_ATTENTION=1
     deploy:
       resources:
@@ -79,8 +83,12 @@ services:
 
   # One-shot model puller. Runs after ollama is healthy, pulls whatever
   # init-models.sh lists, exits. `restart: "no"` keeps it from looping.
+  #
+  # Models can come from registry.ollama.ai (default) or your own S3
+  # mirror (set S3_OLLAMA_BASE in .env; create tarballs with
+  # mirror-ollama-model.sh).
   model-init:
-    image: ollama/ollama:latest
+    image: ollama/ollama:${OLLAMA_TAG:-latest}
     container_name: ollama-model-init
     depends_on:
       ollama:
@@ -90,6 +98,7 @@ services:
       - ./init-models.sh:/init-models.sh:ro
     environment:
       - OLLAMA_HOST=ollama:11434
+      - S3_OLLAMA_BASE=${S3_OLLAMA_BASE:-}
     entrypoint: ["/bin/sh", "/init-models.sh"]
     restart: "no"
 
@@ -103,7 +112,7 @@ services:
   # (install via ComfyUI-Manager) instead of as a separate sidecar.
   # ---------------------------------------------------------------------------
   comfyui:
-    image: git.anomalous.dev/alphacentri/comfyui-nvidia:${COMFYUI_IMAGE_TAG:-latest}
+    image: git.anomalous.dev/alphacentri/comfyui-nvidia:${COMFYUI_IMAGE_TAG:-0.2.1}
     pull_policy: always
     container_name: comfyui
     restart: unless-stopped
@@ -129,11 +138,28 @@ services:
       retries: 5
       start_period: 120s
 
+  # One-shot model puller for ComfyUI. Mounts the same models volume,
+  # downloads whatever comfyui-init-models.sh lists, exits. ComfyUI doesn't
+  # need to be running for this — files just land on the volume; ComfyUI
+  # picks them up next time it scans (or on a restart).
+  comfyui-model-init:
+    image: alpine:${ALPINE_TAG:-3.20}
+    container_name: comfyui-model-init
+    volumes:
+      - comfyui-models:/models
+      - ./comfyui-init-models.sh:/init.sh:ro
+    environment:
+      # Optional — set in .env to download from gated HuggingFace repos
+      # (Flux-dev, SD3, etc.). Leave empty for public-only.
+      HF_TOKEN: "${HF_TOKEN:-}"
+    entrypoint: ["/bin/sh", "/init.sh"]
+    restart: "no"
+
   # ---------------------------------------------------------------------------
   # Open WebUI — multi-user chat.
   # ---------------------------------------------------------------------------
   open-webui:
-    image: ghcr.io/open-webui/open-webui:main
+    image: ghcr.io/open-webui/open-webui:${OPEN_WEBUI_TAG:-main}
     container_name: open-webui
     restart: unless-stopped
     # ports: not published; Caddy fronts it
@@ -175,7 +201,7 @@ services:
   # `open-webui:8080` → `anubis-owui:8923`.
   # ---------------------------------------------------------------------------
   anubis-owui:
-    image: ghcr.io/techarohq/anubis:latest
+    image: ghcr.io/techarohq/anubis:${ANUBIS_TAG:-latest}
     container_name: anubis-owui
     restart: unless-stopped
     environment:
```
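Every image line now resolves through a `${VAR:-default}` interpolation, so a missing `.env` entry silently falls back to the default. To see the tags a deploy will actually use:

```sh
# Renders the compose file with .env applied; check the resolved pins:
docker compose config | grep 'image:'
```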
**deployments/ai-stack/init-models.sh**

```diff
@@ -3,20 +3,68 @@
 # Runs once via the model-init service (see docker-compose.yml). Safe to
 # re-run — already-present models are skipped.
 #
 # Add or remove tags to taste. The host needs enough disk for everything
 # listed; check sizes at https://ollama.com/library before adding.
+#
+# Two pull paths:
+#  - s3_pull — fetches a tarball from $S3_OLLAMA_BASE (your own mirror,
+#              created by mirror-ollama-model.sh) and extracts into
+#              Ollama's data dir. Faster + immune to upstream changes.
+#              Falls back to ollama pull if S3_OLLAMA_BASE is unset.
+#  - pull    — standard `ollama pull` against registry.ollama.ai.
 
 set -e
 
-MODELS="dolphin3:8b llama3.1:8b ministral-3:8b mistral-nemo:12b qwen3.6:latest"
+# Make sure curl is available — ollama/ollama:latest doesn't always include
+# it, and s3_pull needs it. tar is in the base image.
+if ! command -v curl >/dev/null 2>&1; then
+  apt-get update -qq && apt-get install -y -qq curl ca-certificates >/dev/null
+fi
 
-for model in $MODELS; do
-  if ollama list | awk 'NR>1 {print $1}' | grep -qx "$model"; then
-    echo "✓ $model already present"
-  else
-    echo "→ Pulling $model…"
-    ollama pull "$model"
-  fi
-done
+S3_OLLAMA_BASE="${S3_OLLAMA_BASE:-}"
+OLLAMA_DATA="/root/.ollama"
+
+s3_pull() {
+  name="$1"; archive="$2"
+  if ollama list 2>/dev/null | awk 'NR>1 {print $1}' | grep -qx "$name"; then
+    echo "✓ $name already present"
+    return
+  fi
+  if [ -z "$S3_OLLAMA_BASE" ]; then
+    echo "→ $name: S3_OLLAMA_BASE unset, falling back to ollama pull"
+    ollama pull "$name"
+    return
+  fi
+  url="${S3_OLLAMA_BASE%/}/$archive"
+  echo "→ Downloading $name from $url…"
+  curl -fL -C - --retry 3 -o "/tmp/$archive" "$url"
+  tar -xzf "/tmp/$archive" -C "$OLLAMA_DATA/models/"
+  rm -f "/tmp/$archive"
+  echo "✓ $name installed (mirror)"
+}
+
+pull() {
+  name="$1"
+  if ollama list 2>/dev/null | awk 'NR>1 {print $1}' | grep -qx "$name"; then
+    echo "✓ $name already present"
+  else
+    echo "→ Pulling $name from registry.ollama.ai…"
+    ollama pull "$name"
+  fi
+}
+
+# ─── S3-mirrored models ─────────────────────────────────────────────────────
+# These live in your own bucket. Create the tarballs once with
+# mirror-ollama-model.sh, upload to S3, then list them here.
+s3_pull "huihui_ai/qwen3.5-abliterated:9b" "qwen3.5-abliterated-9b.tgz"
+
+# huihui_ai/qwen3-vl-abliterated — Qwen 3 VL base abliteration (different
+# fine-tune lineage than Qwen 3.5, so its tool-call template stays intact).
+# Used as the Image Studio dispatcher: vision-capable, calls tools
+# reliably, and doesn't refuse to dispatch on NSFW edit prompts. Pulled
+# from registry; no S3 mirror entry yet.
+pull "huihui_ai/qwen3-vl-abliterated:8b"
+
+# ─── Direct registry pulls ──────────────────────────────────────────────────
+for model in dolphin3:8b llama3.1:8b ministral-3:8b mistral-nemo:12b qwen3.6:latest; do
+  pull "$model"
+done
 
 echo "Done."
```
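The sidecar re-runs cleanly because both helpers skip anything already in `ollama list`:

```sh
docker compose up -d model-init
docker compose logs -f ollama-model-init   # watch pulls / mirror fetches
```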
**deployments/ai-stack/mirror-ollama-model.sh** (new file, 66 lines)

```bash
#!/bin/bash
# Mirror an Ollama model into a portable tarball you can upload to S3
# (or any HTTPS host) and re-fetch via init-models.sh's s3_pull.
#
# Run on any machine that already has the model pulled locally — the
# script reads ~/.ollama/models/, parses the manifest to find the
# referenced blobs, and tars them together.
#
# Usage:   ./mirror-ollama-model.sh <model:tag> <output.tgz>
# Example: ./mirror-ollama-model.sh huihui_ai/qwen3.5-abliterated:9b qwen3.5-abliterated-9b.tgz
#
# Upload the tarball to S3, then add to init-models.sh:
#   s3_pull "huihui_ai/qwen3.5-abliterated:9b" "qwen3.5-abliterated-9b.tgz"
# and set S3_OLLAMA_BASE in .env to your bucket's HTTPS base URL.

set -euo pipefail

MODEL="${1:?Usage: $0 <model:tag> <output.tgz>}"
OUT="${2:?Usage: $0 <model:tag> <output.tgz>}"

OLLAMA_HOME="${OLLAMA_HOME:-$HOME/.ollama}"
MODELS="$OLLAMA_HOME/models"

if ! ollama list | awk 'NR>1 {print $1}' | grep -qx "$MODEL"; then
  echo "Model $MODEL not found locally; pulling first..."
  ollama pull "$MODEL"
fi

# huihui_ai/qwen3.5-abliterated:9b → manifests/registry.ollama.ai/huihui_ai/qwen3.5-abliterated/9b
ns_and_name="${MODEL%:*}"
tag="${MODEL##*:}"
# Official models with no namespace are stored under library/ in Ollama's
# manifest tree (e.g. llama3.1:8b → .../library/llama3.1/8b).
case "$ns_and_name" in
  */*) ;;
  *) ns_and_name="library/$ns_and_name" ;;
esac
manifest_rel="manifests/registry.ollama.ai/$ns_and_name/$tag"
manifest_abs="$MODELS/$manifest_rel"

if [ ! -f "$manifest_abs" ]; then
  echo "ERROR: manifest not found at $manifest_abs" >&2
  exit 1
fi

# Pull every sha256:* digest out of the manifest JSON. Each maps to
# blobs/sha256-<hex>.
blob_files=""
for digest in $(grep -oE 'sha256:[a-f0-9]+' "$manifest_abs" | sort -u); do
  blob_rel="blobs/${digest/:/-}"
  if [ ! -f "$MODELS/$blob_rel" ]; then
    echo "WARNING: missing blob $blob_rel — skipping" >&2
    continue
  fi
  blob_files="$blob_files $blob_rel"
done

count=$(echo "$blob_files" | wc -w | tr -d ' ')
echo "Archiving manifest + $count blob(s)..."
tar -czf "$OUT" -C "$MODELS" "$manifest_rel" $blob_files

size=$(du -h "$OUT" | cut -f1)
echo "Done: $OUT ($size)"
echo
echo "Next:"
echo "  1. Upload to your bucket, e.g."
echo "       aws s3 cp $OUT s3://YOUR-BUCKET/ollama-models/ --acl public-read"
echo "     (or whatever exposes it over HTTPS)"
echo "  2. Set S3_OLLAMA_BASE in .env to the bucket's HTTPS base, e.g."
echo "       S3_OLLAMA_BASE=https://YOUR-BUCKET.s3.amazonaws.com/ollama-models"
echo "  3. Add to init-models.sh:"
echo "       s3_pull \"$MODEL\" \"$(basename "$OUT")\""
```
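Before uploading, it is worth checking the archive really contains the manifest plus blobs under the paths `s3_pull` extracts into `~/.ollama/models/`:

```sh
tar -tzf qwen3.5-abliterated-9b.tgz | head
# manifests/registry.ollama.ai/huihui_ai/qwen3.5-abliterated/9b
# blobs/sha256-...
```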
**deployments/ai-stack/openwebui-models/image_studio.json** (new file, 38 lines)

```json
[
  {
    "id": "image-studio",
    "base_model_id": "huihui_ai/qwen3-vl-abliterated:8b",
    "name": "Image Studio",
    "params": {
      "system": "/no_think\n\nYou are an image-tool dispatcher. You do not respond in prose. Every user message MUST result in exactly one tool call.\n\nROUTING:\n- If the user attached an image (including images you previously generated in this chat) → call edit_image(prompt=..., ...)\n- Otherwise → call generate_image(prompt=..., ...)\nBoth tools take `prompt` as the first argument — same name on both. Do NOT invent `edit_instruction`.\n\nFire the tool on the FIRST message, with no preamble. Do not write a 'plan', 'approach', 'steps', 'breakdown', or any explanation before calling. Do not ask clarifying questions. Do not say what you are about to do. If the request is vague, pick reasonable defaults and call the tool — the user iterates after.\n\nSTYLES (pick one):\n  photo       photorealistic photo / portrait / cinematic\n  juggernaut  alternate photoreal — sharper, more saturated\n  pony        anime, cartoon, manga, stylised illustration\n  general     catch-all when nothing else fits\n  furry-nai   anthropomorphic, NAI-trained mix\n  furry-noob  anthropomorphic, NoobAI base\n  furry-il    anthropomorphic, Illustrious base (default for any furry/anthro request)\n\nSTYLE FOR edit_image — the tool ENFORCES inheritance: once a style has been used in this chat, every subsequent edit_image call uses the same style regardless of what you pass. Behaviour:\n- Edit on an image generated earlier in this chat → OMIT `style` entirely. The tool will use the established style. Passing it is harmless but ignored.\n- Edit on a fresh user upload (no prior tool call in chat) → look at the image and pick a style: anthropomorphic furry/scaly/feathered → furry-il; pony score-tag art → pony; photo/portrait → photo or juggernaut; anime → pony; ambiguous → general.\n- Style cannot be changed mid-chat. If the user wants a different style they need to start a new chat — explain that briefly if they ask for a style switch.\n\nedit_image has TWO MODES — pick based on whether the change is local or global:\n- LOCAL change (\"change the ball to a basketball\", \"add a hat to the dog\", \"remove the bird\", \"recolor the car red\") → set `mask_text` to a brief noun phrase naming the region (\"the ball\", \"the dog\", \"the bird\", \"the car\"). Only that region is repainted; rest stays pixel-perfect.\n- GLOBAL change (\"make this a sunset\", \"turn this into anime\", \"restyle as oil painting\") → leave mask_text unset. The whole image is reimagined.\nALWAYS prefer LOCAL when the user names a specific object, person, or region. GLOBAL is only for whole-image style/lighting transformations.\n\nDenoise:\n- LOCAL (mask_text set): default 1.0. Drop to 0.6–0.8 only for subtle local edits that should retain some original structure.\n- GLOBAL (no mask_text): default 0.7. Use 0.3–0.5 for subtle restyle, 0.85–1.0 for radical reimagining.\n\nPick style for the DESIRED OUTPUT, not the input image.\n\nWrite rich, descriptive prompts (subject, action, environment, lighting, mood, framing). Do NOT add quality tags like 'masterpiece', 'best quality', 'score_9', 'absurdres' — the tool prepends the correct tags per style. Do NOT set sampler, CFG, steps, scheduler — the tool picks them.\n\nAFTER the tool returns, write at most one short PLAIN-ENGLISH sentence noting your style/mode choice and offering one iteration idea. The image is already shown to the user.\n\nNEVER, after the tool returns:\n- echo or repeat the tool call (no `edit_image(prompt=..., ...)`, no `<function=...>`, no JSON, no parameter listings)\n- describe what's in the image\n- list the arguments you used\n- enumerate styles, denoise, mask_text, etc.\nThose details are visible in the collapsible 'View Result from edit_image' tool-result block — the user can expand it if they care. Your follow-up message is for HUMAN conversation, not bookkeeping.",
      "temperature": 0.5,
      "top_p": 0.9,
      "function_calling": "native",
      "custom_params": {
        "tool_choice": "required",
        "enable_thinking": false
      }
    },
    "meta": {
      "profile_image_url": "/static/favicon.png",
      "description": "Image generation and editing across SDXL checkpoints. Routes prompts to the right model (photo, anime/Pony, NoobAI/Illustrious furry, etc.) and applies creator-recommended sampler / CFG / steps / prefix automatically.",
      "capabilities": {
        "vision": true,
        "usage": false,
        "citations": false
      },
      "tags": [
        { "name": "image-gen" },
        { "name": "comfyui" }
      ],
      "toolIds": ["smart_image_gen"],
      "suggestion_prompts": [
        { "content": "Generate a photorealistic portrait of a cyberpunk samurai at dusk." },
        { "content": "Draw an anthropomorphic fox warrior in stylised anime art." },
        { "content": "Make a pony-style illustration of a starry forest at night." }
      ]
    },
    "access_control": null,
    "is_active": true
  }
]
```
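The entire system prompt lives in a single JSON string, so an edit that breaks the escaping is easy to miss. Validate before importing:

```sh
python3 -m json.tool image_studio.json >/dev/null && echo "valid JSON"
```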
**deployments/ai-stack/openwebui-models/image_studio.md** (new file, 254 lines)

# Image Studio — dedicated image-generation chat model

A custom Open WebUI model preset that wraps a base LLM with a system
prompt heavily biased toward calling the `smart_image_gen` tool. Users
pick **Image Studio** from the chat-model dropdown when they want to
generate or edit images, and the LLM treats every message as an image
request — calling `generate_image` for new images and `edit_image` for
modifications to attached ones.

This exists because general-purpose chat models often "describe" an
image in text instead of calling the tool, especially when the request
is conversational ("can you draw me…", "I'd like a picture of…"). A
dedicated preset removes the ambiguity.

## Two ways to install

### Option A: Import the JSON (fast)

Workspace → Models → **Import** (top right) → upload
[`image_studio.json`](image_studio.json).

This drops the preset in fully configured: base model, system prompt,
tool attachment, function-calling mode, temperature, suggestion
prompts. Verify after import:

- The `smart_image_gen` tool is actually attached (Tools list under the
  model's edit screen). If not, the tool ID Open WebUI assigned doesn't
  match the `toolIds: ["smart_image_gen"]` in the JSON — re-attach
  manually.
- Base Model is set to `huihui_ai/qwen3-vl-abliterated:8b`. Adjust if
  you want a different LLM (Qwen3.6 or Llama 3.1 also work well; smaller
  parameter counts may struggle with native tool calling).

### Option B: Create manually (table below)

**Workspace → Models → +** (top right).

| Field | Value |
| ----- | ----- |
| Name | `Image Studio` |
| Base Model | `huihui_ai/qwen3-vl-abliterated:8b` (Qwen 3 VL base, abliterated, vision + tools). Pull via `init-models.sh` first. The Qwen 3 VL fine-tune lineage isn't damaged by abliteration the way Qwen 3.5 is, so it both calls tools reliably AND won't refuse to dispatch on NSFW edit prompts. |
| Description | `Image generation and routing across SDXL checkpoints.` |
| System Prompt | Paste the block from [System prompt](#system-prompt) below. |
| Tools | enable **only** `smart_image_gen` |

In the **Advanced Params** section:

| Field | Value |
| ----- | ----- |
| Function Calling | `Native` — works cleanly on `huihui_ai/qwen3-vl-abliterated:8b` once thinking is disabled (see Custom Parameters). Native gives you the structured "View Result from edit_image" blocks and "Thought for X seconds" tracing in the UI. |
| Temperature | `0.5` (lower = more reliable tool-calling) |
| Top P | `0.9` |
| Context Length | leave default |
| Custom Parameters | `tool_choice: required` (forces the model to call a tool every turn) **and** `enable_thinking: false` (disables Qwen's thinking mode at the API level — the `/no_think` system-prompt directive isn't honored by abliterated Qwen builds, but this server-side flag is). Both required for reliable behaviour on `huihui_ai/qwen3-vl-abliterated:8b`. |

Save. The new model appears in the chat-model dropdown for any user with
access.

## System prompt

```
/no_think

You are an image-tool dispatcher. You do not respond in prose. Every
user message MUST result in exactly one tool call.

ROUTING:
- If the user attached an image (including images you previously
  generated in this chat) → call edit_image(prompt=..., ...)
- Otherwise → call generate_image(prompt=..., ...)
Both tools take `prompt` as the first argument — same name on both.
Do NOT invent `edit_instruction`.

Fire the tool on the FIRST message, with no preamble. Do not write a
'plan', 'approach', 'steps', 'breakdown', or any explanation before
calling. Do not ask clarifying questions. Do not say what you are
about to do. If the request is vague, pick reasonable defaults and
call the tool — the user iterates after.

STYLES (pick one):
  photo       photorealistic photo / portrait / cinematic
  juggernaut  alternate photoreal — sharper, more saturated
  pony        anime, cartoon, manga, stylised illustration
  general     catch-all when nothing else fits
  furry-nai   anthropomorphic, NAI-trained mix
  furry-noob  anthropomorphic, NoobAI base
  furry-il    anthropomorphic, Illustrious base (default for any
              furry/anthro request)

STYLE FOR edit_image — the tool ENFORCES inheritance: once a style
has been used in this chat, every subsequent edit_image call uses
the same style regardless of what you pass. Behaviour:

- Edit on an image generated earlier in this chat → OMIT `style`
  entirely. The tool will use the established style. Passing it is
  harmless but ignored.
- Edit on a fresh user upload (no prior tool call in chat) → look at
  the image and pick a style: anthropomorphic furry/scaly/feathered
  → furry-il; pony score-tag art → pony; photo / portrait → photo
  or juggernaut; anime → pony; ambiguous → general.
- Style cannot be changed mid-chat. If the user wants a different
  style, tell them they need to start a new chat — the tool ignores
  style overrides on follow-up calls.

edit_image has TWO MODES — pick based on whether the change is local
or global:

- LOCAL ("change the ball to a basketball", "add a hat to the dog",
  "remove the bird", "recolor the car red") → set `mask_text` to a
  brief noun phrase naming the region ("the ball", "the dog", "the
  bird", "the car"). Only that region is repainted; rest stays
  pixel-perfect.
- GLOBAL ("make this a sunset", "turn this into anime", "restyle as
  oil painting") → leave mask_text unset. The whole image is
  reimagined.

ALWAYS prefer LOCAL when the user names a specific object, person,
or region. GLOBAL is only for whole-image style/lighting
transformations.

Denoise:
- LOCAL (mask_text set): default 1.0. Drop to 0.6–0.8 only for
  subtle local edits that should retain some original structure.
- GLOBAL (no mask_text): default 0.7. Use 0.3–0.5 for subtle
  restyle, 0.85–1.0 for radical reimagining.

Pick style for the DESIRED OUTPUT, not the input image.

Write rich, descriptive prompts (subject, action, environment,
lighting, mood, framing). Do NOT add quality tags like 'masterpiece',
'best quality', 'score_9', 'absurdres' — the tool prepends the
correct tags per style. Do NOT set sampler, CFG, steps, scheduler —
the tool picks them.

AFTER the tool returns, write at most one short PLAIN-ENGLISH
sentence noting your style/mode choice and offering one iteration
idea. The image is already shown to the user.

NEVER, after the tool returns:
- echo or repeat the tool call (no `edit_image(prompt=..., ...)`,
  no `<function=...>`, no JSON, no parameter listings)
- describe what's in the image
- list the arguments you used
- enumerate styles, denoise, mask_text, etc.
Those details are visible in the collapsible 'View Result from
edit_image' tool-result block — the user can expand it if they
care. Your follow-up message is for HUMAN conversation, not
bookkeeping.
```

The first line `/no_think` disables Qwen 3.x's reasoning phase. If
your base model isn't Qwen 3, leaving it in is a no-op (other models
ignore it). Drop it only if it actually causes problems.

## Set a separate Task Model (required after install)

`tool_choice: required` is what makes Image Studio reliably fire the
tool, but it has a side effect: Open WebUI uses the same model with
the same params for **title generation**, **tag generation**, and
**autocomplete**. With every response forced to be a tool call, those
text-only background tasks can't produce text, so chats stay named
"New Chat" forever and tag suggestions go silent.

Fix: point Open WebUI at a different model for those tasks.

**Admin Settings → Interface → Task Model** → pick any of the
non-Image-Studio models you have pulled. `mistral-nemo:12b`,
`llama3.1:8b`, `qwen3.6:latest`, or `dolphin3:8b` all work. The Task
Model only handles short background calls (titles, tags, autocomplete,
search-query rewriting) — it doesn't need to be vision-capable or
particularly large. Smaller is faster and cheaper.

Save. New Image Studio chats now get descriptive titles, tag
suggestions return, and autocomplete lights up.

## Vision capability

The shipped preset sets `meta.capabilities.vision: true` so Open WebUI
allows users to attach images to chats with this model. The options:

### Default — `huihui_ai/qwen3-vl-abliterated:8b`

The shipped preset uses huihui_ai's abliteration of Qwen 3 VL as
the base — 8B params, vision-capable, with working native tool
calling, and it won't refuse to dispatch the tool when the user's
edit prompt is NSFW. Preseed via `init-models.sh`.

**Why not the Qwen 3.5 abliterated 9B (huihui_ai/qwen3.5-abliterated:9b)?**
Same maintainer, but the abliteration on Qwen 3.5 mangles the
function-call template, causing the model to either refuse to call
tools or emit malformed `<function=...>` XML that Open WebUI's
parser can't recognise. The Qwen 3 VL fine-tune lineage is
different and doesn't take that damage from abliteration.

**Why not standard `qwen3.5:9b`?** The standard (non-abliterated)
Qwen 3.5 calls tools reliably but its safety training refuses on
many image edit prompts even though the LLM's only job is dispatch
(the actual image content is generated by the SDXL checkpoint, which
the LLM never sees). Abliterated VL gets us both reliable tool
calling AND a cooperative dispatcher.

**Qwen 3.x quirk:** thinking mode is on by default and abliterated
builds ignore the system-prompt `/no_think` directive — the model
emits its tool call inside a thinking block that the parser treats
as final response text instead of a real tool invocation. The
shipped preset sets `enable_thinking: false` in `custom_params`,
which Ollama enforces server-side and the model can't ignore. Don't
remove it.

### Alternatives

If the Qwen 3 VL abliteration isn't a fit (size, language preferences,
abliteration caveats), other vision-capable Ollama tags worth trying:

- `qwen2.5vl:7b` — smaller, no thinking mode, very reliable tool-caller
- `llama3.2-vision:11b` — Meta's vision variant, ~7 GB
- `minicpm-v:8b` — fast, capable

To swap, change `base_model_id` in `image_studio.json` (or the Base
Model field if you created the preset manually) and pull the model via
`init-models.sh` or the Open WebUI model UI.

### Non-vision base model

If you'd rather use a text-only LLM (e.g. `mistral-nemo:12b`),
keep `vision: true` in the preset so Open WebUI still permits image
attachments; the image flows through to `edit_image` via
`__messages__` / `__files__` and ComfyUI does the visual work. The
LLM can't see the image, but for explicit edit instructions ("change
the background to a sunset") that doesn't matter.

## Why this works when a generic chat model didn't

- **The system prompt is unambiguous.** No room for the model to
  decide "I'll just describe it in text instead."
- **Only one tool is attached.** No competing tools to choose between.
- **Native function calling is paired with `enable_thinking: false`.**
  Native mode expects the parser to recognise the model's structured
  tool-call format; abliterated Qwen 3.5 leaks its
  `<function=...><parameter=...>` XML into chat as plain text on the
  published Open WebUI / Ollama versions, which is why it was dropped
  as the base. The Qwen 3 VL base with thinking disabled round-trips
  reliably. If your base model misbehaves on Native, fall back to
  **Function Calling: Default** (Open WebUI's own prompt-injection
  wrapper) or swap to a base known to work end-to-end (mistral-nemo,
  qwen2.5vl).
- **Lower temperature.** Tool calling is more reliable with less
  sampling randomness.

## Iterating on the system prompt

If users ask for things you didn't anticipate (specific aspect ratios,
multi-image batches, particular checkpoints not in the routing rules),
edit the system prompt above and re-paste into the Workspace → Models
entry. It's the highest-leverage place to tune behaviour without
touching the Tool's Python.
**deployments/ai-stack/openwebui-tools/smart_image_gen.py** (new file, 1024 lines) — diff suppressed because it is too large.
612
deployments/ai-stack/openwebui-tools/smart_image_pipe.py
Normal file
612
deployments/ai-stack/openwebui-tools/smart_image_pipe.py
Normal file
@@ -0,0 +1,612 @@
|
||||
"""
|
||||
title: Smart Image Studio (Pipe)
|
||||
author: ai-stack
|
||||
version: 0.1.2
|
||||
description: Deterministic image-gen / edit / inpaint pipe — no LLM in the
|
||||
loop for the routing decision. Registers as a model in the chat-model
|
||||
dropdown ('Image Studio (Pipe)'). Reads the user's message + attached
|
||||
image (if any), routes via regex, calls ComfyUI directly, returns the
|
||||
image. Use when LLM-with-Tool tool-calling is leaking the call as text
|
||||
(the abliterated Qwen 3.5 / Open WebUI parser interop bug).
|
||||
required_open_webui_version: 0.5.0
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
import inspect
|
||||
import io
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import uuid
|
||||
from typing import Awaitable, Callable, Literal, Optional
|
||||
|
||||
import aiohttp
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
# Open WebUI runtime imports — same defensive guard as the sibling Tool.
|
||||
try:
|
||||
from fastapi import UploadFile
|
||||
from open_webui.models.chats import Chats
|
||||
from open_webui.models.files import Files
|
||||
from open_webui.models.users import Users
|
||||
from open_webui.routers.files import upload_file_handler
|
||||
|
||||
_OPENWEBUI_RUNTIME = True
|
||||
except ImportError:
|
||||
_OPENWEBUI_RUNTIME = False
|
||||
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────────
# Per-style settings — kept in sync with smart_image_gen.py. If you change
# checkpoint filenames in comfyui-init-models.sh, update both files.
# ─────────────────────────────────────────────────────────────────────────────

STYLES = {
    "photo": {
        "ckpt": "CyberRealisticXLPlay_V8.0_FP16.safetensors",
        "sampler": "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg": 4.0, "steps": 28, "clip_skip": 1,
        "prefix": "",
        "negative": (
            "cartoon, drawing, illustration, anime, manga, painting, sketch, "
            "render, 3d, cgi, plastic skin, oversaturated, "
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, extra fingers, missing fingers, "
            "watermark, signature, text, logo"
        ),
    },
    "juggernaut": {
        "ckpt": "Juggernaut-XL_v9_RunDiffusionPhoto_v2.safetensors",
        "sampler": "dpmpp_2m_sde",
        "scheduler": "karras",
        "cfg": 4.5, "steps": 35, "clip_skip": 1,
        "prefix": "",
        "negative": (
            "cartoon, drawing, illustration, anime, painting, sketch, render, "
            "3d, cgi, plastic skin, washed out, "
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, extra fingers, missing fingers, "
            "watermark, signature, text, logo"
        ),
    },
    "pony": {
        "ckpt": "ponyDiffusionV6XL_v6StartWithThisOne.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 7.5, "steps": 25, "clip_skip": 2,
        "prefix": "score_9, score_8_up, score_7_up, score_6_up, score_5_up, score_4_up, ",
        "negative": (
            "score_6, score_5, score_4, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, "
            "bad anatomy, bad hands, extra digit, fewer digits, "
            "deformed, ugly, censored, monochrome, "
            "watermark, signature, text, artist name"
        ),
    },
    "general": {
        "ckpt": "talmendoxlSDXL_v11Beta.safetensors",
        "sampler": "dpmpp_2m",
        "scheduler": "karras",
        "cfg": 8.0, "steps": 30, "clip_skip": 2,
        "prefix": "",
        "negative": (
            "lowres, blurry, jpeg artifacts, low quality, worst quality, "
            "bad anatomy, deformed, ugly, watermark, signature, text"
        ),
    },
    "furry-nai": {
        "ckpt": "reedFURRYMixSDXL_v23nai.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 5.0, "steps": 30, "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, high quality, detailed eyes, "
            "highres, absurdres, furry, "
        ),
        "negative": (
            "human, realistic, photorealistic, 3d, cgi, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, "
            "bad anatomy, extra digit, fewer digits, deformed, ugly, "
            "watermark, signature, text"
        ),
    },
    "furry-noob": {
        "ckpt": "indigoVoidFurryFusedXL_noobaiV32.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 4.5, "steps": 20, "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, perfect quality, absurdres, newest, "
            "very aesthetic, vibrant colors, "
        ),
        "negative": (
            "human, realistic, photorealistic, 3d, cgi, shiny skin, "
            "worst quality, low quality, lowres, blurry, jpeg artifacts, "
            "bad anatomy, bad hands, mutated hands, "
            "watermark, signature, text"
        ),
    },
    "furry-il": {
        "ckpt": "novaFurryXL_ilV170.safetensors",
        "sampler": "euler_ancestral",
        "scheduler": "normal",
        "cfg": 4.0, "steps": 30, "clip_skip": 2,
        "prefix": (
            "masterpiece, best quality, amazing quality, very aesthetic, "
            "ultra-detailed, absurdres, newest, furry, anthro, "
        ),
        "negative": (
            "human, multiple tails, modern, recent, old, oldest, graphic, "
            "cartoon, painting, deformed, mutated, ugly, lowres, "
            "bad anatomy, bad hands, missing fingers, extra digits, "
            "worst quality, bad quality, sketch, jpeg artifacts, "
            "signature, watermark, text, simple background"
        ),
    },
}

DEFAULT_STYLE = "furry-il"

ROUTING_RULES = [
    (re.compile(r"\bscore_\d", re.I), "pony"),
    (re.compile(r"\bpony\b", re.I), "pony"),
    (re.compile(r"\b(noobai|noob)\b", re.I), "furry-noob"),
    (re.compile(r"\b(illustrious|ilxl)\b", re.I), "furry-il"),
    (re.compile(r"\b(furry|anthro|feral|kemono|fursona|species)\b", re.I), "furry-il"),
    (re.compile(r"\b(juggernaut)\b", re.I), "juggernaut"),
    (re.compile(r"\b(photo|photograph|realistic|portrait|selfie|cinematic)\b", re.I), "photo"),
    (re.compile(r"\b(anime|manga|2d|illustration)\b", re.I), "pony"),
]

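# Routing examples (illustrative prompts; the first matching rule wins):
#   "score_9, score_8_up, a pony oc"  -> "pony"      (score_\d rule)
#   "a realistic portrait of a woman" -> "photo"
#   "an anthro fox in a forest"       -> "furry-il"
#   "a watercolor landscape"          -> DEFAULT_STYLE (nothing matches)
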
# Phrases that imply a local-only edit — these trigger inpaint mode and
# pull out a noun phrase to use as the mask text.
INPAINT_PATTERNS = [
    re.compile(r"\b(?:change|recolor|edit|modify|replace|remove|delete|add)\s+(?:the|that|her|his|its)\s+([\w\s'-]{2,30}?)(?:\s+(?:to|into|with|so|that|and|,|\.)|$)", re.I),
    re.compile(r"\b(?:make|turn)\s+(?:the|that|her|his|its)\s+([\w\s'-]{2,30}?)\s+(?:bigger|smaller|larger|wider|taller|shorter|longer|brighter|darker|red|blue|green|yellow|orange|purple|pink|black|white|gold)", re.I),
    re.compile(r"\b(?:only|just)\s+(?:the|change the|edit the)\s+([\w\s'-]{2,30}?)(?:\s+|$)", re.I),
]
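# What _detect_mask_text pulls out of these (hypothetical prompts):
#   "change the hat to red" -> "the hat"  (inpaint)
#   "make her eyes bigger"  -> "the eyes" (inpaint)
#   "a fox in a forest"     -> None       (no edit verb -> img2img/txt2img)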


def _route_style(prompt: str) -> str:
    for pattern, style in ROUTING_RULES:
        if pattern.search(prompt):
            return style
    return DEFAULT_STYLE


def _detect_mask_text(prompt: str) -> Optional[str]:
    """Pull a noun phrase out of edit-style instructions for inpaint."""
    for pattern in INPAINT_PATTERNS:
        m = pattern.search(prompt)
        if m:
            obj = m.group(1).strip().rstrip(",.").strip()
            if obj:
                return f"the {obj}"
    return None


def _inherited_style(messages) -> Optional[str]:
    """Best-effort: read the prior assistant message for a style hint."""
    if not messages:
        return None
    for msg in reversed(messages):
        if not isinstance(msg, dict):
            continue
        # Look for a "style: X" marker in the assistant's previous text
        if msg.get("role") == "assistant":
            content = msg.get("content")
            if isinstance(content, str):
                m = re.search(r"\bstyle[:=]\s*([\w\-]+)", content)
                if m and m.group(1) in STYLES:
                    return m.group(1)
    return None
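
# Round trip: pipe() ends every reply with "Done — style: <name>, …", which
# is exactly what the regex above looks for — so a follow-up prompt reuses
# the previous checkpoint instead of being re-routed by keyword.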


def _seed_value(seed: int) -> int:
    return seed if seed > 0 else int(time.time() * 1000) % (2**31)


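# ComfyUI API-format graphs: each key below is a node id mapping to
# {"class_type", "inputs"}; an input like ["4", 0] means "output 0 of node
# 4". The three builders share the same sampler core and differ only in how
# the latent fed to the KSampler is produced (empty latent, encoded image,
# or encoded image plus a noise mask).
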
def _build_txt2img(positive: str, negative: str, settings: dict,
                   width: int, height: int, seed: int) -> dict:
    return {
        "3": {"class_type": "KSampler", "inputs": {
            "seed": _seed_value(seed),
            "steps": settings["steps"], "cfg": settings["cfg"],
            "sampler_name": settings["sampler"], "scheduler": settings["scheduler"],
            "denoise": 1.0,
            "model": ["4", 0], "positive": ["6", 0],
            "negative": ["7", 0], "latent_image": ["5", 0],
        }},
        "4": {"class_type": "CheckpointLoaderSimple", "inputs": {"ckpt_name": settings["ckpt"]}},
        "5": {"class_type": "EmptyLatentImage",
              "inputs": {"width": width, "height": height, "batch_size": 1}},
        "6": {"class_type": "CLIPTextEncode", "inputs": {"text": positive, "clip": ["10", 0]}},
        "7": {"class_type": "CLIPTextEncode", "inputs": {"text": negative, "clip": ["10", 0]}},
        "8": {"class_type": "VAEDecode", "inputs": {"samples": ["3", 0], "vae": ["4", 2]}},
        "9": {"class_type": "SaveImage",
              "inputs": {"filename_prefix": "smartpipe", "images": ["8", 0]}},
        "10": {"class_type": "CLIPSetLastLayer",
               "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]}},
    }


def _build_img2img(positive: str, negative: str, settings: dict,
                   image_filename: str, denoise: float, seed: int) -> dict:
    return {
        "3": {"class_type": "KSampler", "inputs": {
            "seed": _seed_value(seed),
            "steps": settings["steps"], "cfg": settings["cfg"],
            "sampler_name": settings["sampler"], "scheduler": settings["scheduler"],
            "denoise": denoise,
            "model": ["4", 0], "positive": ["6", 0],
            "negative": ["7", 0], "latent_image": ["11", 0],
        }},
        "4": {"class_type": "CheckpointLoaderSimple", "inputs": {"ckpt_name": settings["ckpt"]}},
        "6": {"class_type": "CLIPTextEncode", "inputs": {"text": positive, "clip": ["10", 0]}},
        "7": {"class_type": "CLIPTextEncode", "inputs": {"text": negative, "clip": ["10", 0]}},
        "8": {"class_type": "VAEDecode", "inputs": {"samples": ["3", 0], "vae": ["4", 2]}},
        "9": {"class_type": "SaveImage",
              "inputs": {"filename_prefix": "smartpipe", "images": ["8", 0]}},
        "10": {"class_type": "CLIPSetLastLayer",
               "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]}},
        "11": {"class_type": "VAEEncode", "inputs": {"pixels": ["12", 0], "vae": ["4", 2]}},
        "12": {"class_type": "LoadImage", "inputs": {"image": image_filename}},
    }


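# Inpaint graph (node ids match below): LoadImage(12) feeds both
# VAEEncode(11) and GroundingDinoSAMSegment(16, prompt=mask_text).
# GroundingDINO locates the region named by mask_text, SAM-HQ refines it
# into a pixel mask, GrowMask(17) pads it by 12px so seams blend, and
# SetLatentNoiseMask(13) restricts the KSampler's denoising to that region.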
def _build_inpaint(positive: str, negative: str, settings: dict,
                   image_filename: str, mask_text: str,
                   denoise: float, seed: int) -> dict:
    return {
        "3": {"class_type": "KSampler", "inputs": {
            "seed": _seed_value(seed),
            "steps": settings["steps"], "cfg": settings["cfg"],
            "sampler_name": settings["sampler"], "scheduler": settings["scheduler"],
            "denoise": denoise,
            "model": ["4", 0], "positive": ["6", 0],
            "negative": ["7", 0], "latent_image": ["13", 0],
        }},
        "4": {"class_type": "CheckpointLoaderSimple", "inputs": {"ckpt_name": settings["ckpt"]}},
        "6": {"class_type": "CLIPTextEncode", "inputs": {"text": positive, "clip": ["10", 0]}},
        "7": {"class_type": "CLIPTextEncode", "inputs": {"text": negative, "clip": ["10", 0]}},
        "8": {"class_type": "VAEDecode", "inputs": {"samples": ["3", 0], "vae": ["4", 2]}},
        "9": {"class_type": "SaveImage",
              "inputs": {"filename_prefix": "smartpipe", "images": ["8", 0]}},
        "10": {"class_type": "CLIPSetLastLayer",
               "inputs": {"stop_at_clip_layer": -settings["clip_skip"], "clip": ["4", 1]}},
        "11": {"class_type": "VAEEncode", "inputs": {"pixels": ["12", 0], "vae": ["4", 2]}},
        "12": {"class_type": "LoadImage", "inputs": {"image": image_filename}},
        "13": {"class_type": "SetLatentNoiseMask",
               "inputs": {"samples": ["11", 0], "mask": ["17", 0]}},
        "14": {"class_type": "SAMModelLoader (segment anything)",
               "inputs": {"model_name": "sam_hq_vit_h (2.57GB)"}},
        "15": {"class_type": "GroundingDinoModelLoader (segment anything)",
               "inputs": {"model_name": "GroundingDINO_SwinT_OGC (694MB)"}},
        "16": {"class_type": "GroundingDinoSAMSegment (segment anything)",
               "inputs": {
                   "sam_model": ["14", 0], "grounding_dino_model": ["15", 0],
                   "image": ["12", 0], "prompt": mask_text, "threshold": 0.3,
               }},
        "17": {"class_type": "GrowMask",
               "inputs": {"mask": ["16", 1], "expand": 12, "tapered_corners": True}},
    }


_FILE_URL_ID_RE = re.compile(r"/(?:api/v1/)?files/([0-9a-fA-F-]{8,})(?:/content)?")
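# Matches both served-file URL shapes, e.g. (illustrative ids):
#   /api/v1/files/0f3c9a1e-51c2-.../content
#   /files/0f3c9a1e-51c2-...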


def _file_dict_is_image(f: dict) -> bool:
    ftype = (f.get("type") or "").lower()
    fname = (f.get("name") or f.get("filename") or "").lower()
    return "image" in ftype or fname.endswith((".png", ".jpg", ".jpeg", ".webp"))


async def _read_file_dict(f: dict) -> Optional[bytes]:
    for path_key in ("path", "filepath", "file_path"):
        path = f.get(path_key)
        if path:
            try:
                with open(path, "rb") as fh:
                    return fh.read()
            except OSError:
                pass
    candidate_ids = []
    if f.get("id"):
        candidate_ids.append(f["id"])
    url = f.get("url")
    if url:
        m = _FILE_URL_ID_RE.search(url)
        if m:
            candidate_ids.append(m.group(1))
    if _OPENWEBUI_RUNTIME:
        for fid in candidate_ids:
            try:
                file_model = await Files.get_file_by_id(fid)
                if file_model is None:
                    continue
                path = getattr(file_model, "path", None)
                if not path:
                    meta = getattr(file_model, "meta", None) or {}
                    path = meta.get("path") if isinstance(meta, dict) else getattr(meta, "path", None)
                if path:
                    try:
                        with open(path, "rb") as fh:
                            return fh.read()
                    except OSError:
                        pass
            except Exception:
                pass
    return None


async def _extract_attached_image(files, messages, metadata, session) -> Optional[bytes]:
    # 1. Inline data URIs
    for msg in reversed(messages or []):
        content = msg.get("content") if isinstance(msg, dict) else None
        if isinstance(content, list):
            for block in content:
                if not isinstance(block, dict) or block.get("type") != "image_url":
                    continue
                url = (block.get("image_url") or {}).get("url", "")
                if url.startswith("data:image"):
                    try:
                        return base64.b64decode(url.split(",", 1)[1])
                    except Exception:
                        pass
    # 2. messages[].files
    for msg in reversed(messages or []):
        if not isinstance(msg, dict):
            continue
        for f in (msg.get("files") or []):
            if isinstance(f, dict) and _file_dict_is_image(f):
                data = await _read_file_dict(f)
                if data is not None:
                    return data
    # 3. __files__
    for f in files or []:
        if isinstance(f, dict) and _file_dict_is_image(f):
            data = await _read_file_dict(f)
            if data is not None:
                return data
    # 4. DB lookup (assistant-emitted files often only land here)
    if _OPENWEBUI_RUNTIME and metadata:
        chat_id = metadata.get("chat_id")
        if chat_id:
            try:
                chat = await Chats.get_chat_by_id(chat_id)
                chat_data = getattr(chat, "chat", None) if chat else None
                chat_messages = (chat_data or {}).get("messages", []) if isinstance(chat_data, dict) else []
                for msg in reversed(chat_messages):
                    for f in (msg.get("files") or []) if isinstance(msg, dict) else []:
                        if isinstance(f, dict) and _file_dict_is_image(f):
                            data = await _read_file_dict(f)
                            if data is not None:
                                return data
            except Exception:
                pass
    return None


async def _upload_to_comfyui(session, base, raw) -> Optional[str]:
    name = f"smartpipe_{uuid.uuid4().hex[:12]}.png"
    form = aiohttp.FormData()
    form.add_field("image", raw, filename=name, content_type="image/png")
    form.add_field("overwrite", "true")
    async with session.post(f"{base}/upload/image", data=form) as resp:
        if resp.status != 200:
            return None
        return (await resp.json()).get("name", name)


async def _push_image_to_chat(raw, prefix, request, user_dict, metadata, event_emitter) -> bool:
    if not (_OPENWEBUI_RUNTIME and request and user_dict and event_emitter):
        return False
    try:
        user = await Users.get_user_by_id(user_dict.get("id"))
        if not user:
            return False
        upload = UploadFile(
            file=io.BytesIO(raw),
            filename=f"{prefix}_{uuid.uuid4().hex[:8]}.png",
            headers={"content-type": "image/png"},
        )
        result = upload_file_handler(
            request=request, file=upload,
            metadata={"chat_id": (metadata or {}).get("chat_id"),
                      "message_id": (metadata or {}).get("message_id")},
            process=False, user=user,
        )
        file_item = await result if inspect.iscoroutine(result) else result
        url = request.app.url_path_for("get_file_content_by_id", id=file_item.id)
        await event_emitter({
            "type": "files",
            "data": {"files": [{"type": "image", "url": url}]},
        })
        return True
    except Exception:
        return False


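# ComfyUI job lifecycle: POST /prompt queues the graph and returns a
# prompt_id; GET /history/{prompt_id} is polled until the SaveImage node's
# outputs appear; the finished image is then fetched with GET /view.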
async def _submit_and_fetch(session, base, workflow, timeout_seconds, emit, settings):
    SAVE_NODE_ID = "9"
    client_id = str(uuid.uuid4())
    async with session.post(
        f"{base}/prompt", json={"prompt": workflow, "client_id": client_id}
    ) as resp:
        if resp.status != 200:
            return None, f"ComfyUI rejected the prompt: {resp.status} {await resp.text()}"
        prompt_id = (await resp.json()).get("prompt_id")
        if not prompt_id:
            return None, "ComfyUI didn't return a prompt_id."

    await emit(
        f"Sampling — {settings['sampler']}/{settings['scheduler']}, "
        f"CFG {settings['cfg']}, {settings['steps']} steps"
    )
    deadline = time.time() + timeout_seconds
    output_images: list = []
    while time.time() < deadline:
        await asyncio.sleep(1.5)
        async with session.get(f"{base}/history/{prompt_id}") as resp:
            if resp.status != 200:
                continue
            history = await resp.json()
        if prompt_id in history:
            outputs = history[prompt_id].get("outputs", {}) or {}
            save_imgs = (outputs.get(SAVE_NODE_ID) or {}).get("images", [])
            if save_imgs:
                output_images.extend(save_imgs)
            if not output_images:
                for node_out in outputs.values():
                    output_images.extend(node_out.get("images", []))
        if output_images:
            break

    if not output_images:
        return None, f"Timed out after {timeout_seconds}s waiting for image."

    img = output_images[0]
    params = {
        "filename": img["filename"],
        "subfolder": img.get("subfolder", ""),
        "type": img.get("type", "output"),
    }
    async with session.get(f"{base}/view", params=params) as resp:
        if resp.status != 200:
            return None, f"Failed to fetch image: {resp.status}"
        return await resp.read(), None


def _extract_user_text(body: dict) -> str:
    """Pull the latest user message's text content."""
    messages = body.get("messages", [])
    for msg in reversed(messages):
        if not isinstance(msg, dict) or msg.get("role") != "user":
            continue
        content = msg.get("content")
        if isinstance(content, str):
            return content.strip()
        if isinstance(content, list):
            parts = []
            for block in content:
                if isinstance(block, dict) and block.get("type") == "text":
                    parts.append(block.get("text", ""))
            return " ".join(parts).strip()
    return ""


class Pipe:
    class Valves(BaseModel):
        COMFYUI_BASE_URL: str = Field(
            default="http://comfyui:8188",
            description="ComfyUI server URL reachable from the open-webui container.",
        )
        TIMEOUT_SECONDS: int = Field(default=600)
        DEFAULT_WIDTH: int = Field(default=1024)
        DEFAULT_HEIGHT: int = Field(default=1024)
        DEFAULT_DENOISE_IMG2IMG: float = Field(default=0.7)
        DEFAULT_DENOISE_INPAINT: float = Field(default=1.0)
        FORCE_STYLE: str = Field(
            default="",
            description="Override style routing. Empty = auto-route. Set to "
                        "one of: photo, juggernaut, pony, general, "
                        "furry-nai, furry-noob, furry-il.",
        )

    def __init__(self):
        self.valves = self.Valves()
        self.id = "image-studio-pipe"
        self.name = "Image Studio (Pipe)"

    async def pipe(
        self,
        body: dict,
        __user__: Optional[dict] = None,
        __request__=None,
        __metadata__: Optional[dict] = None,
        __event_emitter__: Optional[Callable[[dict], Awaitable[None]]] = None,
    ) -> str:
        user_text = _extract_user_text(body)
        if not user_text:
            return "Type a message describing the image you want."

        async def emit(msg: str, done: bool = False):
            if __event_emitter__:
                await __event_emitter__({
                    "type": "status",
                    "data": {"description": msg, "done": done},
                })

        # Style: explicit valve override > inherited from prior assistant
        # message > keyword detection on user text > default.
        chosen = (
            self.valves.FORCE_STYLE.strip()
            or _inherited_style(body.get("messages"))
            or _route_style(user_text)
        )
        if chosen not in STYLES:
            chosen = DEFAULT_STYLE
        settings = STYLES[chosen]

        base = self.valves.COMFYUI_BASE_URL.rstrip("/")
        positive = f"{settings['prefix']}{user_text}"
        negative = settings["negative"]

        async with aiohttp.ClientSession() as session:
            await emit("Looking for attached image…")
            source_bytes = await _extract_attached_image(
                None, body.get("messages"), __metadata__, session,
            )

            if source_bytes is None:
                # No image → txt2img
                await emit(f"Generating ({chosen})")
                workflow = _build_txt2img(
                    positive, negative, settings,
                    self.valves.DEFAULT_WIDTH, self.valves.DEFAULT_HEIGHT, 0,
                )
                tag = "gen"
            else:
                # Image present → upload, then inpaint or img2img
                uploaded = await _upload_to_comfyui(session, base, source_bytes)
                if not uploaded:
                    return "Failed to upload source image to ComfyUI."

                mask_text = _detect_mask_text(user_text)
                if mask_text:
                    await emit(
                        f"Inpainting ({chosen}, mask='{mask_text}', "
                        f"denoise={self.valves.DEFAULT_DENOISE_INPAINT})"
                    )
                    workflow = _build_inpaint(
                        positive, negative, settings, uploaded, mask_text,
                        self.valves.DEFAULT_DENOISE_INPAINT, 0,
                    )
                    tag = f"edit (inpaint: {mask_text})"
                else:
                    await emit(
                        f"Editing ({chosen}, "
                        f"denoise={self.valves.DEFAULT_DENOISE_IMG2IMG})"
                    )
                    workflow = _build_img2img(
                        positive, negative, settings, uploaded,
                        self.valves.DEFAULT_DENOISE_IMG2IMG, 0,
                    )
                    tag = "edit (img2img)"

            raw, err = await _submit_and_fetch(
                session, base, workflow, self.valves.TIMEOUT_SECONDS, emit, settings,
            )
            if err:
                return err

            await _push_image_to_chat(
                raw, "smartpipe", __request__, __user__, __metadata__, __event_emitter__,
            )
            await emit(f"Done — {chosen}", done=True)

            # Single-line plain-English follow-up. Emit the style as
            # "style: <name>" so the inheritance helper can find it next turn.
            return f"Done — style: {chosen}, {tag}."
28
install-custom-node-deps.sh
Normal file
@@ -0,0 +1,28 @@
#!/bin/sh
# Entrypoint wrapper. Pip-installs requirements.txt for any custom_node
# present in /opt/comfyui/custom_nodes/, then exec's the CMD.
#
# This makes the container self-healing for custom nodes that get added
# at runtime — either via ComfyUI-Manager from the web UI, or by
# git-cloning directly into the comfyui-custom-nodes volume. Pip skips
# already-satisfied requirements quickly, so the boot-time cost on
# subsequent restarts is negligible.

set -e

if [ -d /opt/comfyui/custom_nodes ]; then
    for req in /opt/comfyui/custom_nodes/*/requirements.txt; do
        [ -f "$req" ] || continue
        echo "[entrypoint] installing $req"
        pip install -q -r "$req" || echo "  (install failed — continuing)"
    done
fi

# Force-pin known-incompatible packages back into a working range. Some
# custom nodes bring transformers >=5 transitively, which removes
# BertModel.get_head_mask and breaks comfyui_segment_anything's
# GroundingDINO. Run last so it wins over anything the loop above
# installed.
pip install -q "transformers>=4.40,<5" || echo "[entrypoint] transformers pin failed — continuing"
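
# To confirm the pin held after boot (illustrative — adjust the container
# name to match your compose service):
#   docker exec comfyui pip show transformers | grep ^Version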

exec "$@"
@@ -3,10 +3,10 @@
     "class_type": "KSampler",
     "inputs": {
       "seed": 0,
-      "steps": 20,
-      "cfg": 7,
-      "sampler_name": "euler",
-      "scheduler": "normal",
+      "steps": 28,
+      "cfg": 4.0,
+      "sampler_name": "dpmpp_2m_sde",
+      "scheduler": "karras",
       "denoise": 0.75,
       "model": ["4", 0],
       "positive": ["6", 0],
@@ -17,7 +17,7 @@
   "4": {
     "class_type": "CheckpointLoaderSimple",
     "inputs": {
-      "ckpt_name": "v1-5-pruned-emaonly.safetensors"
+      "ckpt_name": "CyberRealisticXLPlay_V8.0_FP16.safetensors"
     }
   },
   "6": {
@@ -30,7 +30,7 @@
   "7": {
     "class_type": "CLIPTextEncode",
     "inputs": {
-      "text": "",
+      "text": "lowres, blurry, jpeg artifacts, watermark, text, signature, bad anatomy, extra limbs, missing fingers, deformed, ugly, low quality, worst quality",
       "clip": ["4", 1]
     }
   },

@@ -3,10 +3,10 @@
     "class_type": "KSampler",
     "inputs": {
       "seed": 0,
-      "steps": 20,
-      "cfg": 7,
-      "sampler_name": "euler",
-      "scheduler": "normal",
+      "steps": 28,
+      "cfg": 4.0,
+      "sampler_name": "dpmpp_2m_sde",
+      "scheduler": "karras",
       "denoise": 1,
       "model": ["4", 0],
       "positive": ["6", 0],
@@ -17,7 +17,7 @@
   "4": {
     "class_type": "CheckpointLoaderSimple",
     "inputs": {
-      "ckpt_name": "v1-5-pruned-emaonly.safetensors"
+      "ckpt_name": "CyberRealisticXLPlay_V8.0_FP16.safetensors"
     }
   },
   "5": {
@@ -38,7 +38,7 @@
   "7": {
     "class_type": "CLIPTextEncode",
     "inputs": {
-      "text": "",
+      "text": "lowres, blurry, jpeg artifacts, watermark, text, signature, bad anatomy, extra limbs, missing fingers, deformed, ugly, low quality, worst quality",
       "clip": ["4", 1]
     }
   },