Spaces:

DandyDonUnhinged
/

spock-body-lora-training

Build error

App Files Files Community

Sasha (Spock) committed 4 days ago

Commit

7a18c1b

1 Parent(s): c8cd022

materialize ai-toolkit (binaries + bak files removed); fix Dockerfile with PYTHONPATH

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

Dockerfile +16 -10
ai-toolkit/.gitignore +187 -0
ai-toolkit/.gitmodules +0 -0
ai-toolkit/FAQ.md +10 -0
ai-toolkit/LICENSE +21 -0
ai-toolkit/README.md +316 -0
ai-toolkit/build_and_push_docker +29 -0
ai-toolkit/build_and_push_docker_dev +21 -0
ai-toolkit/config/examples/extract.example.yml +75 -0
ai-toolkit/config/examples/generate.example.yaml +60 -0
ai-toolkit/config/examples/mod_lora_scale.yaml +48 -0
ai-toolkit/config/examples/modal/modal_train_lora_flux_24gb.yaml +96 -0
ai-toolkit/config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml +98 -0
ai-toolkit/config/examples/train_flex_redux.yaml +112 -0
ai-toolkit/config/examples/train_full_fine_tune_flex.yaml +107 -0
ai-toolkit/config/examples/train_full_fine_tune_lumina.yaml +99 -0
ai-toolkit/config/examples/train_lora_chroma_24gb.yaml +104 -0
ai-toolkit/config/examples/train_lora_flex2_24gb.yaml +165 -0
ai-toolkit/config/examples/train_lora_flex_24gb.yaml +101 -0
ai-toolkit/config/examples/train_lora_flux_24gb.yaml +96 -0
ai-toolkit/config/examples/train_lora_flux_kontext_24gb.yaml +106 -0
ai-toolkit/config/examples/train_lora_flux_schnell_24gb.yaml +98 -0
ai-toolkit/config/examples/train_lora_hidream_48.yaml +112 -0
ai-toolkit/config/examples/train_lora_lumina.yaml +96 -0
ai-toolkit/config/examples/train_lora_omnigen2_24gb.yaml +94 -0
ai-toolkit/config/examples/train_lora_qwen_image_24gb.yaml +95 -0
ai-toolkit/config/examples/train_lora_qwen_image_edit_2509_32gb.yaml +105 -0
ai-toolkit/config/examples/train_lora_qwen_image_edit_32gb.yaml +102 -0
ai-toolkit/config/examples/train_lora_sd35_large_24gb.yaml +97 -0
ai-toolkit/config/examples/train_lora_wan21_14b_24gb.yaml +101 -0
ai-toolkit/config/examples/train_lora_wan21_1b_24gb.yaml +90 -0
ai-toolkit/config/examples/train_lora_wan22_14b_24gb.yaml +111 -0
ai-toolkit/config/examples/train_slider.example.yml +230 -0
ai-toolkit/dgx_instructions.md +84 -0
ai-toolkit/dgx_requirements.txt +13 -0
ai-toolkit/docker-compose.yml +25 -0
ai-toolkit/docker/Dockerfile +108 -0
ai-toolkit/docker/start.sh +70 -0
ai-toolkit/extensions/example/ExampleMergeModels.py +129 -0
ai-toolkit/extensions/example/__init__.py +25 -0
ai-toolkit/extensions/example/config/config.example.yaml +48 -0
ai-toolkit/extensions_built_in/advanced_generator/Img2ImgGenerator.py +256 -0
ai-toolkit/extensions_built_in/advanced_generator/PureLoraGenerator.py +102 -0
ai-toolkit/extensions_built_in/advanced_generator/ReferenceGenerator.py +212 -0
ai-toolkit/extensions_built_in/advanced_generator/__init__.py +59 -0
ai-toolkit/extensions_built_in/advanced_generator/config/train.example.yaml +91 -0
ai-toolkit/extensions_built_in/audio_models/__init__.py +7 -0
ai-toolkit/extensions_built_in/audio_models/ace_step/__init__.py +1 -0
ai-toolkit/extensions_built_in/audio_models/ace_step/ace_step_15_model.py +335 -0
ai-toolkit/extensions_built_in/audio_models/ace_step/src/__init__.py +0 -0

Dockerfile CHANGED Viewed

@@ -1,22 +1,28 @@
 FROM pytorch/pytorch:2.4.0-cuda12.1-cudnn9-runtime
 WORKDIR /app
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
-# Install ai-toolkit
-RUN git clone https://github.com/ostris/ai-toolkit.git /app/ai-toolkit
-WORKDIR /app/ai-toolkit
-RUN git submodule update --init --recursive
-RUN pip install --no-cache-dir -e .
-# Install HF Hub
-RUN pip install --no-cache-dir huggingface_hub
 # Copy training files
 COPY . /app/
-# Pre-download FLUX model and assistant LoRA
-RUN python -c "from huggingface_hub import snapshot_download; snapshot_download('Niansuh/FLUX.1-schnell', cache_dir='/app/hf_cache'); snapshot_download('ostris/FLUX.1-schnell-training-adapter', cache_dir='/app/hf_cache')"
 CMD ["python", "/app/train_cloud.py"]

 FROM pytorch/pytorch:2.4.0-cuda12.1-cudnn9-runtime
 WORKDIR /app
+# System deps
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+# Pre-baked ai-toolkit (numpy 2.5 / dctorch / torchdiffeq / torchsde / clip forks, 3.14-compatible)
+# Copied from local checkout so HF doesn't have to clone the submodule / run pip install -e .
+COPY ai-toolkit /app/ai-toolkit
+# Make ai-toolkit importable. Upstream ai-toolkit ships without setup.py / pyproject.toml,
+# so pip install -e . would fail. We add it to PYTHONPATH instead.
+ENV PYTHONPATH=/app:/app/ai-toolkit
+ENV PYTHONUNBUFFERED=1
+ENV HF_HOME=/app/hf_cache
+ENV TRANSFORMERS_CACHE=/app/hf_cache
+# Install runtime deps
+RUN pip install --no-cache-dir huggingface_hub hf_transfer
 # Copy training files
 COPY . /app/
+# Pre-download FLUX base + training adapter at build time so they're in the image cache
+RUN python -c "import os; os.environ['HF_HUB_ENABLE_HF_TRANSFER']='1'; from huggingface_hub import snapshot_download; snapshot_download('Niansuh/FLUX.1-schnell'); snapshot_download('ostris/FLUX.1-schnell-training-adapter')"
 CMD ["python", "/app/train_cloud.py"]

ai-toolkit/.gitignore ADDED Viewed

	@@ -0,0 +1,187 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+.python
+.node
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+.idea/
+/env.sh
+/models
+/datasets
+/custom/*
+!/custom/.gitkeep
+/.tmp
+/venv.bkp
+/venv.*
+/config/*
+!/config/examples
+!/config/_PUT_YOUR_CONFIGS_HERE).txt
+/output/*
+!/output/.gitkeep
+/extensions/*
+!/extensions/example
+/temp
+/wandb
+.vscode/settings.json
+.DS_Store
+._.DS_Store
+aitk_db.db
+/notes.md
+/data
+.claude

ai-toolkit/.gitmodules ADDED Viewed

File without changes

ai-toolkit/FAQ.md ADDED Viewed

	@@ -0,0 +1,10 @@

+# FAQ
+WIP. Will continue to add things as they are needed.
+## FLUX.1 Training
+#### How much VRAM is required to train a lora on FLUX.1?
+24GB minimum is required.

ai-toolkit/LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2024 Ostris, LLC
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

ai-toolkit/README.md ADDED Viewed

	@@ -0,0 +1,316 @@

+# Ostris AI Toolkit
+AI Toolkit is an easy to use all in one training suite for diffusion models. I try to support all the latest models on consumer grade hardware. Image and video models. It can be run as a GUI or CLI. It is designed to be easy to use but still have every feature imaginable. Free and open source.
+## Supported Models
+### Image
+- [black-forest-labs/FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev) (FLUX.1)
+- [black-forest-labs/FLUX.2-dev](https://huggingface.co/black-forest-labs/FLUX.2-dev) (FLUX.2)
+- [black-forest-labs/FLUX.2-klein-base-4B](https://huggingface.co/black-forest-labs/FLUX.2-klein-base-4B) (FLUX.2-klein-base-4B)
+- [black-forest-labs/FLUX.2-klein-base-9B](https://huggingface.co/black-forest-labs/FLUX.2-klein-base-9B) (FLUX.2-klein-base-9B)
+- [ostris/Flex.1-alpha](https://huggingface.co/ostris/Flex.1-alpha) (Flex.1)
+- [ostris/Flex.2-preview](https://huggingface.co/ostris/Flex.2-preview) (Flex.2)
+- [lodestones/Chroma1-Base](https://huggingface.co/lodestones/Chroma1-Base) (Chroma)
+- [Alpha-VLLM/Lumina-Image-2.0](https://huggingface.co/Alpha-VLLM/Lumina-Image-2.0) (Lumina2)
+- [Qwen/Qwen-Image](https://huggingface.co/Qwen/Qwen-Image) (Qwen-Image)
+- [Qwen/Qwen-Image-2512](https://huggingface.co/Qwen/Qwen-Image-2512) (Qwen-Image-2512)
+- [HiDream-ai/HiDream-I1-Full](https://huggingface.co/HiDream-ai/HiDream-I1-Full) (HiDream I1)
+- [OmniGen2/OmniGen2](https://huggingface.co/OmniGen2/OmniGen2) (OmniGen2)
+- [Tongyi-MAI/Z-Image-Turbo](https://huggingface.co/Tongyi-MAI/Z-Image-Turbo) (Z-Image Turbo)
+- [Tongyi-MAI/Z-Image](https://huggingface.co/Tongyi-MAI/Z-Image) (Z-Image)
+- [ostris/Z-Image-De-Turbo](https://huggingface.co/ostris/Z-Image-De-Turbo) (Z-Image De-Turbo)
+- [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) (SDXL)
+- [stable-diffusion-v1-5/stable-diffusion-v1-5](https://huggingface.co/stable-diffusion-v1-5/stable-diffusion-v1-5) (SD 1.5)
+- [baidu/ERNIE-Image](https://huggingface.co/baidu/ERNIE-Image) (ERNIE-Image)
+- [NucleusAI/Nucleus-Image](https://huggingface.co/NucleusAI/Nucleus-Image) (Nucleus-Image)
+- [HiDream-ai/HiDream-O1-Image](https://huggingface.co/HiDream-ai/HiDream-O1-Image) (HiDream O1)
+- [Photoroom/prxpixel-t2i](https://huggingface.co/Photoroom/prxpixel-t2i) (PRXPixel)
+### Instruction / Edit
+- [black-forest-labs/FLUX.1-Kontext-dev](https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev) (FLUX.1-Kontext-dev)
+- [Qwen/Qwen-Image-Edit](https://huggingface.co/Qwen/Qwen-Image-Edit) (Qwen-Image-Edit)
+- [Qwen/Qwen-Image-Edit-2509](https://huggingface.co/Qwen/Qwen-Image-Edit-2509) (Qwen-Image-Edit-2509)
+- [Qwen/Qwen-Image-Edit-2511](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) (Qwen-Image-Edit-2511)
+- [HiDream-ai/HiDream-E1-1](https://huggingface.co/HiDream-ai/HiDream-E1-1) (HiDream E1)
+### Video
+- [Wan-AI/Wan2.1-T2V-1.3B-Diffusers](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B-Diffusers) (Wan 2.1 1.3B)
+- [Wan-AI/Wan2.1-I2V-14B-480P-Diffusers](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-480P-Diffusers) (Wan 2.1 I2V 14B-480P)
+- [Wan-AI/Wan2.1-I2V-14B-720P-Diffusers](https://huggingface.co/Wan-AI/Wan2.1-I2V-14B-720P-Diffusers) (Wan 2.1 I2V 14B-720P)
+- [Wan-AI/Wan2.1-T2V-14B-Diffusers](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B-Diffusers) (Wan 2.1 14B)
+- [Wan-AI/Wan2.2-T2V-A14B-Diffusers](https://huggingface.co/Wan-AI/Wan2.2-T2V-A14B-Diffusers) (Wan 2.2 14B)
+- [Wan-AI/Wan2.2-I2V-A14B-Diffusers](https://huggingface.co/Wan-AI/Wan2.2-I2V-A14B-Diffusers) (Wan 2.2 I2V 14B)
+- [Wan-AI/Wan2.2-TI2V-5B-Diffusers](https://huggingface.co/Wan-AI/Wan2.2-TI2V-5B-Diffusers) (Wan 2.2 TI2V 5B)
+- [Lightricks/LTX-2](https://huggingface.co/Lightricks/LTX-2) (LTX-2)
+- [Lightricks/LTX-2.3](https://huggingface.co/Lightricks/LTX-2.3) (LTX-2.3)
+- [krea/Krea-2-Raw](https://huggingface.co/krea/Krea-2-Raw) (Krea 2)
+### Audio
+- [ACE-Step/Ace-Step1.5](https://huggingface.co/ACE-Step/Ace-Step1.5) (Ace Step 1.5)
+- [ACE-Step/acestep-v15-xl-base](https://huggingface.co/ACE-Step/acestep-v15-xl-base) (Ace Step 1.5 XL)
+### Experimental
+- [lodestones/Zeta-Chroma](https://huggingface.co/lodestones/Zeta-Chroma) (Zeta Chroma)
+- [ideogram-ai/ideogram-4-fp8](https://huggingface.co/ideogram-ai/ideogram-4-fp8) (Ideogram 4 FP8)
+## Installation
+Requirements:
+- python >=3.10 (3.12 recommended)
+- Nvidia GPU with enough ram to do what you need
+- python venv
+- git
+Linux:
+```bash
+git clone https://github.com/ostris/ai-toolkit.git
+cd ai-toolkit
+python3 -m venv venv
+source venv/bin/activate
+# install torch first
+pip3 install --no-cache-dir torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu128
+pip3 install -r requirements.txt
+```
+For devices running **DGX OS** (including DGX Spark), follow [these](dgx_instructions.md) instructions.
+Windows:
+If you are having issues with Windows, I recommend using the easy install script at [https://github.com/Tavris1/AI-Toolkit-Easy-Install](https://github.com/Tavris1/AI-Toolkit-Easy-Install)
+```bash
+git clone https://github.com/ostris/ai-toolkit.git
+cd ai-toolkit
+python -m venv venv
+.\venv\Scripts\activate
+pip install --no-cache-dir torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu128
+pip install -r requirements.txt
+```
+MacOS:
+Experimental support for Silicon Macs is available. I do not have a Mac with enough RAM to fully test this
+so please let me know if there are issues. There is a convenience script to install and run on MacOS
+located at `./run_mac.zsh` that will install the dependencies locally and run the UI. To run this,
+do the following:
+```bash
+git clone https://github.com/ostris/ai-toolkit.git
+cd ai-toolkit
+chmod +x run_mac.zsh
+./run_mac.zsh
+```
+# AI Toolkit UI
+<img src="https://ostris.com/wp-content/uploads/2025/02/toolkit-ui.jpg" alt="AI Toolkit UI" width="100%">
+The AI Toolkit UI is a web interface for the AI Toolkit. It allows you to easily start, stop, and monitor jobs. It also allows you to easily train models with a few clicks. It also allows you to set a token for the UI to prevent unauthorized access so it is mostly safe to run on an exposed server.
+## Running the UI
+Requirements:
+- Node.js > 20
+The UI does not need to be kept running for the jobs to run. It is only needed to start/stop/monitor jobs. The commands below
+will install / update the UI and its dependencies and start the UI.
+```bash
+cd ui
+npm run build_and_start
+```
+You can now access the UI at `http://localhost:8675` or `http://<your-ip>:8675` if you are running it on a server.
+## Securing the UI
+If you are hosting the UI on a cloud provider or any network that is not secure, I highly recommend securing it with an auth token.
+You can do this by setting the environment variable `AI_TOOLKIT_AUTH` to a super secure password. This token will be required to access
+the UI. You can set this when starting the UI like so:
+```bash
+# Linux
+AI_TOOLKIT_AUTH=super_secure_password npm run build_and_start
+# Windows
+set AI_TOOLKIT_AUTH=super_secure_password && npm run build_and_start
+# Windows Powershell
+$env:AI_TOOLKIT_AUTH="super_secure_password"; npm run build_and_start
+```
+### Training
+1. Copy the example config file located at `config/examples/train_lora_flux_24gb.yaml` (`config/examples/train_lora_flux_schnell_24gb.yaml` for schnell) to the `config` folder and rename it to `whatever_you_want.yml`
+2. Edit the file following the comments in the file
+3. Run the file like so `python run.py config/whatever_you_want.yml`
+A folder with the name and the training folder from the config file will be created when you start. It will have all
+checkpoints and images in it. You can stop the training at any time using ctrl+c and when you resume, it will pick back up
+from the last checkpoint.
+IMPORTANT: If you press ctrl+c while it is saving, it will likely corrupt that checkpoint, so wait until it is done saving.
+### Need help?
+Please do not open a bug report unless it is a bug in the code. You are welcome to [Join my Discord](https://discord.gg/VXmU2f5WEU)
+and ask for help there. However, please refrain from PMing me directly with general questions or support requests. Ask in the Discord
+and I will answer when I can.
+## Ostris Cloud
+You can use many cloud providers to rent GPUs. If you want to help support this project in the largest way possible, please consider using [Ostris Cloud](https://cloud.ostris.com). Ostris Cloud is owned and operated by me, Ostris, and every dollar earned goes directly back into funding the development of this project.
+<a href="https://cloud.ostris.com" target="_blank"><img src="https://cloud.ostris.com/api/og" alt="Ostris Cloud" style="max-width:100%;width:600px;height:auto;"></a>
+## Training in RunPod
+If you would like to use Runpod, but have not signed up yet, please consider using [my Runpod affiliate link](https://runpod.io?ref=h0y9jyr2) to help support this project.
+I maintain an official Runpod Pod template, which can be accessed [here](https://console.runpod.io/deploy?template=0fqzfjy6f3&ref=h0y9jyr2).
+I have also created a short video showing how to get started using AI Toolkit with Runpod [here](https://youtu.be/HBNeS-F6Zz8).
+## Training in Modal
+### 1. Setup
+#### ai-toolkit:
+```
+git clone https://github.com/ostris/ai-toolkit.git
+cd ai-toolkit
+git submodule update --init --recursive
+python -m venv venv
+source venv/bin/activate
+pip install torch
+pip install -r requirements.txt
+pip install --upgrade accelerate transformers diffusers huggingface_hub #Optional, run it if you run into issues
+```
+#### Modal:
+- Run `pip install modal` to install the modal Python package.
+- Run `modal setup` to authenticate (if this doesn’t work, try `python -m modal setup`).
+#### Hugging Face:
+- Get a READ token from [here](https://huggingface.co/settings/tokens) and request access to Flux.1-dev model from [here](https://huggingface.co/black-forest-labs/FLUX.1-dev).
+- Run `huggingface-cli login` and paste your token.
+### 2. Upload your dataset
+- Drag and drop your dataset folder containing the .jpg, .jpeg, or .png images and .txt files in `ai-toolkit`.
+### 3. Configs
+- Copy an example config file located at ```config/examples/modal``` to the `config` folder and rename it to ```whatever_you_want.yml```.
+- Edit the config following the comments in the file, **<ins>be careful and follow the example `/root/ai-toolkit` paths</ins>**.
+### 4. Edit run_modal.py
+- Set your entire local `ai-toolkit` path at `code_mount = modal.Mount.from_local_dir` like:
+   ```
+   code_mount = modal.Mount.from_local_dir("/Users/username/ai-toolkit", remote_path="/root/ai-toolkit")
+   ```
+- Choose a `GPU` and `Timeout` in `@app.function` _(default is A100 40GB and 2 hour timeout)_.
+### 5. Training
+- Run the config file in your terminal: `modal run run_modal.py --config-file-list-str=/root/ai-toolkit/config/whatever_you_want.yml`.
+- You can monitor your training in your local terminal, or on [modal.com](https://modal.com/).
+- Models, samples and optimizer will be stored in `Storage > flux-lora-models`.
+### 6. Saving the model
+- Check contents of the volume by running `modal volume ls flux-lora-models`.
+- Download the content by running `modal volume get flux-lora-models your-model-name`.
+- Example: `modal volume get flux-lora-models my_first_flux_lora_v1`.
+### Screenshot from Modal
+<img width="1728" alt="Modal Training Screenshot" src="https://github.com/user-attachments/assets/7497eb38-0090-49d6-8ad9-9c8ea7b5388b">
+---
+## Dataset Preparation
+Datasets generally need to be a folder containing images and associated text files. Currently, the only supported
+formats are jpg, jpeg, and png. Webp currently has issues. The text files should be named the same as the images
+but with a `.txt` extension. For example `image2.jpg` and `image2.txt`. The text file should contain only the caption.
+You can add the word `[trigger]` in the caption file and if you have `trigger_word` in your config, it will be automatically
+replaced.
+Images are never upscaled but they are downscaled and placed in buckets for batching. **You do not need to crop/resize your images**.
+The loader will automatically resize them and can handle varying aspect ratios.
+## Training Specific Layers
+To train specific layers with LoRA, you can use the `only_if_contains` network kwargs. For instance, if you want to train only the 2 layers
+used by The Last Ben, [mentioned in this post](https://x.com/__TheBen/status/1829554120270987740), you can adjust your
+network kwargs like so:
+```yaml
+      network:
+        type: "lora"
+        linear: 128
+        linear_alpha: 128
+        network_kwargs:
+          only_if_contains:
+            - "transformer.single_transformer_blocks.7.proj_out"
+            - "transformer.single_transformer_blocks.20.proj_out"
+```
+The naming conventions of the layers are in diffusers format, so checking the state dict of a model will reveal
+the suffix of the name of the layers you want to train. You can also use this method to only train specific groups of weights.
+For instance to only train the `single_transformer` for FLUX.1, you can use the following:
+```yaml
+      network:
+        type: "lora"
+        linear: 128
+        linear_alpha: 128
+        network_kwargs:
+          only_if_contains:
+            - "transformer.single_transformer_blocks."
+```
+You can also exclude layers by name using the `ignore_if_contains` network kwarg. For example, to exclude all the single transformer blocks:
+```yaml
+      network:
+        type: "lora"
+        linear: 128
+        linear_alpha: 128
+        network_kwargs:
+          ignore_if_contains:
+            - "transformer.single_transformer_blocks."
+```
+`ignore_if_contains` takes priority over `only_if_contains`. So if a weight is covered by both,
+it will be ignored.
+## LoKr Training
+To learn more about LoKr, read more about it at [KohakuBlueleaf/LyCORIS](https://github.com/KohakuBlueleaf/LyCORIS/blob/main/docs/Guidelines.md). To train a LoKr model, you can adjust the network type in the config file like so:
+```yaml
+      network:
+        type: "lokr"
+        lokr_full_rank: true
+        lokr_factor: 8
+```
+Everything else should work the same including layer targeting.
+## Support My Work
+If you enjoy my projects or use them commercially, please consider sponsoring me. Every bit helps! 💖
+<a href="https://ostris.com/sponsors" target="_blank"><img src="https://ostris.com/wp-content/uploads/2025/05/support-banner2.png" alt="Support my work" style="max-width:100%;height:auto;"></a>
+### Current Sponsors
+All of these people / organizations are the ones who selflessly make this project possible. Thank you!!
+<a href="https://ostris.com/sponsors"><img src="https://ostris.com/sponsors.svg" alt="Sponsors" style="width:100%;height:auto;"></a>

ai-toolkit/build_and_push_docker ADDED Viewed

	@@ -0,0 +1,29 @@

+#!/usr/bin/env bash
+# Extract version from version.py
+if [ -f "version.py" ]; then
+    VERSION=$(python3 -c "from version import VERSION; print(VERSION)")
+    echo "Building version: $VERSION"
+else
+    echo "Error: version.py not found. Please create a version.py file with VERSION defined."
+    exit 1
+fi
+echo "Docker builds from the repo, not this dir. Make sure changes are pushed to the repo."
+echo "Building version: $VERSION and latest"
+# wait 2 seconds
+sleep 2
+# Build the image with cache busting
+docker build --build-arg CACHEBUST=$(date +%s) -t aitoolkit:$VERSION -f docker/Dockerfile .
+# Tag with version and latest
+docker tag aitoolkit:$VERSION ostris/aitoolkit:$VERSION
+docker tag aitoolkit:$VERSION ostris/aitoolkit:latest
+# Push both tags
+echo "Pushing images to Docker Hub..."
+docker push ostris/aitoolkit:$VERSION
+docker push ostris/aitoolkit:latest
+echo "Successfully built and pushed ostris/aitoolkit:$VERSION and ostris/aitoolkit:latest"

ai-toolkit/build_and_push_docker_dev ADDED Viewed

	@@ -0,0 +1,21 @@

+#!/usr/bin/env bash
+VERSION=dev
+GIT_COMMIT=dev
+echo "Docker builds from the repo, not this dir. Make sure changes are pushed to the repo."
+echo "Building version: $VERSION"
+# wait 2 seconds
+sleep 2
+# Build the image with cache busting
+docker build --build-arg CACHEBUST=$(date +%s) -t aitoolkit:$VERSION -f docker/Dockerfile .
+# Tag with the dev version (no latest tag for dev builds)
+docker tag aitoolkit:$VERSION ostris/aitoolkit:$VERSION
+# Push the dev tag
+echo "Pushing images to Docker Hub..."
+docker push ostris/aitoolkit:$VERSION
+echo "Successfully built and pushed ostris/aitoolkit:$VERSION"

ai-toolkit/config/examples/extract.example.yml ADDED Viewed

	@@ -0,0 +1,75 @@

+---
+# this is in yaml format. You can use json if you prefer
+# I like both but yaml is easier to read and write
+# plus it has comments which is nice for documentation
+job: extract # tells the runner what to do
+config:
+  # the name will be used to create a folder in the output folder
+  # it will also replace any [name] token in the rest of this config
+  name: name_of_your_model
+  # can be hugging face model, a .ckpt, or a .safetensors
+  base_model: "/path/to/base/model.safetensors"
+  # can be hugging face model, a .ckpt, or a .safetensors
+  extract_model: "/path/to/model/to/extract/trained.safetensors"
+  # we will create a folder here with the name above, so this will create /path/to/output/folder/name_of_your_model
+  output_folder: "/path/to/output/folder"
+  is_v2: false
+  dtype: fp16 # saved dtype
+  device: cpu # cpu, cuda:0, etc
+  # processes can be chained like this to run multiple in a row
+  # they must all use same models above, but great for testing different
+  # sizes and types of extractions. It is much faster as we already have the models loaded
+  process:
+  # process 1
+  - type: locon  # locon or lora (locon is lycoris)
+    filename: "[name]_64_32.safetensors" # will be put in output folder
+    dtype: fp16
+    mode: fixed
+    linear: 64
+    conv: 32
+  # process 2
+  - type: locon
+    output_path: "/absolute/path/for/this/output.safetensors" # can be absolute
+    mode: ratio
+    linear: 0.2
+    conv: 0.2
+  # process 3
+  - type: locon
+    filename: "[name]_ratio_02.safetensors"
+    mode: quantile
+    linear: 0.5
+    conv: 0.5
+  # process 4
+  - type: lora  # traditional lora extraction (lierla) with linear layers only
+    filename: "[name]_4.safetensors"
+    mode: fixed  # fixed, ratio, quantile supported for lora as well
+    linear: 4 # lora dim or rank
+    # no conv for lora
+  # process 5
+  - type: lora
+    filename: "[name]_q05.safetensors"
+    mode: quantile
+    linear: 0.5
+# you can put any information you want here, and it will be saved in the model
+# the below is an example. I recommend doing trigger words at a minimum
+# in the metadata. The software will include this plus some other information
+meta:
+  name: "[name]"  # [name] gets replaced with the name above
+  description: A short description of your model
+  trigger_words:
+  - put
+  - trigger
+  - words
+  - here
+  version: '0.1'
+  creator:
+    name: Your Name
+    email: your@email.com
+    website: https://yourwebsite.com
+  any: All meta data above is arbitrary, it can be whatever you want.

ai-toolkit/config/examples/generate.example.yaml ADDED Viewed

	@@ -0,0 +1,60 @@

+---
+job: generate # tells the runner what to do
+config:
+  name: "generate" # this is not really used anywhere currently but required by runner
+  process:
+    # process 1
+    - type: to_folder  # process images to a folder
+      output_folder: "output/gen"
+      device: cuda:0 # cpu, cuda:0, etc
+      generate:
+        # these are your defaults you can override most of them with flags
+        sampler: "ddpm" # ignored for now, will add later though ddpm is used regardless for now
+        width: 1024
+        height: 1024
+        neg: "cartoon, fake, drawing, illustration, cgi, animated, anime"
+        seed: -1 # -1 is random
+        guidance_scale: 7
+        sample_steps: 20
+        ext: ".png" # .png, .jpg, .jpeg, .webp
+        # here are the flags you can use for prompts. Always start with
+        # your prompt first then add these flags after. You can use as many
+        # as you like, for example:
+        # photo of a baseball --n painting, ugly --w 1024 --h 1024 --seed 42 --cfg 7 --steps 20
+        # we will try to support all sd-scripts flags where we can
+        # FROM SD-SCRIPTS
+        # --n Treat everything until the next option as a negative prompt.
+        # --w Specify the width of the generated image.
+        # --h Specify the height of the generated image.
+        # --d Specify the seed for the generated image.
+        # --l Specify the CFG scale for the generated image.
+        # --s Specify the number of steps during generation.
+        # OURS and some QOL additions
+        # --p2 Prompt for the second text encoder (SDXL only)
+        # --n2 Negative prompt for the second text encoder (SDXL only)
+        # --gr Specify the guidance rescale for the generated image (SDXL only)
+        # --seed Specify the seed for the generated image same as --d
+        # --cfg Specify the CFG scale for the generated image same as --l
+        # --steps Specify the number of steps during generation same as --s
+        prompt_file: false # if true a txt file will be created next to images with prompt strings used
+        # prompts can also be a path to a text file with one prompt per line
+        # prompts: "/path/to/prompts.txt"
+        prompts:
+          - "photo of batman"
+          - "photo of superman"
+          - "photo of spiderman"
+          - "photo of a superhero --n batman superman spiderman"
+      model:
+        # huggingface name, relative from project path, or absolute path to .safetensors or .ckpt
+        #      name_or_path: "runwayml/stable-diffusion-v1-5"
+        name_or_path: "/mnt/Models/stable-diffusion/models/stable-diffusion/Ostris/Ostris_Real_v1.safetensors"
+        is_v2: false  # for v2 models
+        is_v_pred: false # for v-prediction models (most v2 models)
+        is_xl: false  # for SDXL models
+        dtype: bf16

ai-toolkit/config/examples/mod_lora_scale.yaml ADDED Viewed

	@@ -0,0 +1,48 @@

+---
+job: mod
+config:
+  name: name_of_your_model_v1
+  process:
+    - type: rescale_lora
+      # path to your current lora model
+      input_path: "/path/to/lora/lora.safetensors"
+      # output path for your new lora model, can be the same as input_path to replace
+      output_path: "/path/to/lora/output_lora_v1.safetensors"
+      # replaces meta with the meta below (plus minimum meta fields)
+      # if false, we will leave the meta alone except for updating hashes (sd-script hashes)
+      replace_meta: true
+      # how to adjust, we can scale the up_down weights or the alpha
+      # up_down is the default and probably the best, they will both net the same outputs
+      # would only affect rare NaN cases and maybe merging with old merge tools
+      scale_target: 'up_down'
+      # precision to save, fp16 is the default and standard
+      save_dtype: fp16
+      # current_weight is the ideal weight you use as a multiplier when using the lora
+      # IE in automatic1111 <lora:my_lora:6.0> the 6.0 is the current_weight
+      # you can do negatives here too if you want to flip the lora
+      current_weight: 6.0
+      # target_weight is the ideal weight you use as a multiplier when using the lora
+      # instead of the one above. IE in automatic1111 instead of using <lora:my_lora:6.0>
+      # we want to use <lora:my_lora:1.0> so 1.0 is the target_weight
+      target_weight: 1.0
+      # base model for the lora
+      # this is just used to add meta so automatic1111 knows which model it is for
+      # assume v1.5 if these are not set
+      is_xl: false
+      is_v2: false
+meta:
+  # this is only used if you set replace_meta to true above
+  name: "[name]"  # [name] gets replaced with the name above
+  description: A short description of your lora
+  trigger_words:
+  - put
+  - trigger
+  - words
+  - here
+  version: '0.1'
+  creator:
+    name: Your Name
+    email: your@email.com
+    website: https://yourwebsite.com
+  any: All meta data above is arbitrary, it can be whatever you want.

ai-toolkit/config/examples/modal/modal_train_lora_flux_24gb.yaml ADDED Viewed

	@@ -0,0 +1,96 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_flux_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "/root/ai-toolkit/modal_output" # must match MOUNT_DIR from run_modal.py
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        # your dataset must be placed in /ai-toolkit and /root is for modal to find the dir:
+        - folder_path: "/root/ai-toolkit/your-dataset"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # flux enjoys multiple resolutions
+      train:
+        batch_size: 1
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation_steps: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with flux
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+#        linear_timesteps: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on.
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for flux, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        # if you get an error, or get stuck while downloading,
+        # check https://github.com/ostris/ai-toolkit/issues/84, download the model locally and
+        # place it like "/root/ai-toolkit/FLUX.1-dev"
+        name_or_path: "black-forest-labs/FLUX.1-dev"
+        is_flux: true
+        quantize: true  # run 8bit mixed precision
+#        low_vram: true  # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""  # not used on flux
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 20
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml ADDED Viewed

	@@ -0,0 +1,98 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_flux_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "/root/ai-toolkit/modal_output" # must match MOUNT_DIR from run_modal.py
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        # your dataset must be placed in /ai-toolkit and /root is for modal to find the dir:
+        - folder_path: "/root/ai-toolkit/your-dataset"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # flux enjoys multiple resolutions
+      train:
+        batch_size: 1
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation_steps: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with flux
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+#        linear_timesteps: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on.
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for flux, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        # if you get an error, or get stuck while downloading,
+        # check https://github.com/ostris/ai-toolkit/issues/84, download the models locally and
+        # place them like "/root/ai-toolkit/FLUX.1-schnell" and "/root/ai-toolkit/FLUX.1-schnell-training-adapter"
+        name_or_path: "black-forest-labs/FLUX.1-schnell"
+        assistant_lora_path: "ostris/FLUX.1-schnell-training-adapter" # Required for flux schnell training
+        is_flux: true
+        quantize: true  # run 8bit mixed precision
+        # low_vram is painfully slow to fuse in the adapter avoid it unless absolutely necessary
+#        low_vram: true  # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""  # not used on flux
+        seed: 42
+        walk_seed: true
+        guidance_scale: 1  # schnell does not do guidance
+        sample_steps: 4  # 1 - 4 works well
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_flex_redux.yaml ADDED Viewed

	@@ -0,0 +1,112 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_flex_redux_finetune_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      adapter:
+        type: "redux"
+        # you can finetune an existing adapter or start from scratch. Set to null to start from scratch
+        name_or_path: '/local/path/to/redux_adapter_to_finetune.safetensors'
+        # name_or_path: null
+        # image_encoder_path: 'google/siglip-so400m-patch14-384' # Flux.1 redux adapter
+        image_encoder_path: 'google/siglip2-so400m-patch16-512' # Flex.1 512 redux adapter
+        # image_encoder_arch: 'siglip' # for Flux.1
+        image_encoder_arch: 'siglip2'
+        # You need a control input for each sample. Best to do squares for both images
+        test_img_path:
+          - "/path/to/x_01.jpg"
+          - "/path/to/x_02.jpg"
+          - "/path/to/x_03.jpg"
+          - "/path/to/x_04.jpg"
+          - "/path/to/x_05.jpg"
+          - "/path/to/x_06.jpg"
+          - "/path/to/x_07.jpg"
+          - "/path/to/x_08.jpg"
+          - "/path/to/x_09.jpg"
+          - "/path/to/x_10.jpg"
+        clip_layer: 'last_hidden_state'
+        train: true
+      save:
+        dtype: bf16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          # clip_image_path is the directory containing your control images. They must have the same filename as their train image. (extension does not matter)
+          # for normal redux, we are just recreating the same image, so you can use the same folder path above
+          clip_image_path: "/path/to/control/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          resolution: [ 512, 768, 1024 ]  # flex enjoys multiple resolutions
+      train:
+        # this is what I used for the 24GB card, but feel free to adjust
+        # total batch size is 6 here
+        batch_size: 3
+        gradient_accumulation: 2
+        # captions are not needed for this training, we cache a blank prompt and rely on the vision encoder
+        unload_text_encoder: true
+        loss_type: "mse"
+        train_unet: true
+        train_text_encoder: false
+        steps: 4000000  # I set this very high and stop when I like the results
+        content_or_style: balanced  # content, style, balanced
+        gradient_checkpointing: true
+        noise_scheduler: "flowmatch" # or "ddpm", "lms", "euler_a"
+        timestep_type: "flux_shift"
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # this is for Flex.1, comment this out for FLUX.1-dev
+        bypass_guidance_embedding: true
+        dtype: bf16
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+      model:
+        name_or_path: "ostris/Flex.1-alpha"
+        is_flux: true
+        quantize: true
+        text_encoder_bits: 8
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        # I leave half blank to test prompt and unprompted
+        prompts:
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - ""
+          - ""
+          - ""
+          - ""
+          - ""
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 25
+        network_multiplier: 1.0
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_full_fine_tune_flex.yaml ADDED Viewed

	@@ -0,0 +1,107 @@

+---
+# This configuration requires 48GB of VRAM or more to operate
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_flex_finetune_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+      # performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+      # trigger_word: "p3r5on"
+      save:
+        dtype: bf16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 2 # how many intermittent saves to keep
+        save_format: 'diffusers' # 'diffusers'
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          # cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # flex enjoys multiple resolutions
+      train:
+        batch_size: 1
+        # IMPORTANT! For Flex, you must bypass the guidance embedder during training
+        bypass_guidance_embedding: true
+        # can be 'sigmoid', 'linear', or 'lognorm_blend'
+        timestep_type: 'sigmoid'
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with flex
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adafactor"
+        lr: 3e-5
+        # Parameter swapping can reduce vram requirements. Set factor from 1.0 to 0.0.
+        # 0.1 is 10% of parameters active at each step. Only works with adafactor
+        # do_paramiter_swapping: true
+        # paramiter_swapping_factor: 0.9
+        # uncomment this to skip the pre training sample
+        # skip_first_sample: true
+        # uncomment to completely disable sampling
+        # disable_sampling: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on if you have the vram
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for flex, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "ostris/Flex.1-alpha"
+        is_flux: true # flex is flux architecture
+        # full finetuning quantized models is a crapshoot and results in subpar outputs
+        # quantize: true
+        # you can quantize just the T5 text encoder here to save vram
+        quantize_te: true
+        # only train the transformer blocks
+        only_if_contains:
+          - "transformer.transformer_blocks."
+          - "transformer.single_transformer_blocks."
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+          # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""  # not used on flex
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_full_fine_tune_lumina.yaml ADDED Viewed

	@@ -0,0 +1,99 @@

+---
+# This configuration requires 24GB of VRAM or more to operate
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_lumina_finetune_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+      # performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+      # trigger_word: "p3r5on"
+      save:
+        dtype: bf16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 2 # how many intermittent saves to keep
+        save_format: 'diffusers' # 'diffusers'
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          # cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # lumina2 enjoys multiple resolutions
+      train:
+        batch_size: 1
+        # can be 'sigmoid', 'linear', or 'lumina2_shift'
+        timestep_type: 'lumina2_shift'
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with lumina2
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adafactor"
+        lr: 3e-5
+        # Parameter swapping can reduce vram requirements. Set factor from 1.0 to 0.0.
+        # 0.1 is 10% of parameters active at each step. Only works with adafactor
+        # do_paramiter_swapping: true
+        # paramiter_swapping_factor: 0.9
+        # uncomment this to skip the pre training sample
+        # skip_first_sample: true
+        # uncomment to completely disable sampling
+        # disable_sampling: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on if you have the vram
+        # ema_config:
+        #   use_ema: true
+        #   ema_decay: 0.99
+        # will probably need this if gpu supports it for lumina2, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "Alpha-VLLM/Lumina-Image-2.0"
+        is_lumina2: true # lumina2 architecture
+        # you can quantize just the Gemma2 text encoder here to save vram
+        quantize_te: true
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+          # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a cat that is half black and half orange tabby, split down the middle. The cat has on a blue tophat. They are holding a martini glass with a pink ball of yarn in it with green knitting needles sticking out, in one paw. In the other paw, they are holding a DVD case for a movie titled, \"This is a test\" that has a golden robot on it. In the background is a busy night club with a giant mushroom man dancing with a bear."
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4.0
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_chroma_24gb.yaml ADDED Viewed

	@@ -0,0 +1,104 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_chroma_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#       hf_repo_id: your-username/your-model-slug
+#       hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # chroma enjoys multiple resolutions
+      train:
+        batch_size: 1
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with chroma
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+#        linear_timesteps: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on.
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for chroma, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # Download whichever model you prefer from the Chroma repo
+        # https://huggingface.co/lodestones/Chroma/tree/main
+        # point to it here.
+        # name_or_path: "/path/to/chroma/chroma-unlocked-vVERSION.safetensors"
+        # using lodestones/Chroma will automatically use the latest version
+        name_or_path: "lodestones/Chroma"
+        # # You can also select a version of Chroma like so
+        # name_or_path: "lodestones/Chroma/v28"
+        arch: "chroma"
+        quantize: true  # run 8bit mixed precision
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""  # negative prompt, optional
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_flex2_24gb.yaml ADDED Viewed

	@@ -0,0 +1,165 @@

+# Note, Flex2 is a highly experimental WIP model. Finetuning a model with built in controls and inpainting has not
+# been done before, so you will be experimenting with me on how to do it. This is my recommended setup, but this is highly
+# subject to change as we learn more about how Flex2 works.
+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_flex2_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 32
+        linear_alpha: 32
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#       hf_repo_id: your-username/your-model-slug
+#       hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          # Flex2 is trained with controls and inpainting. If you want the model to truly understand how the
+          # controls function with your dataset, it is a good idea to keep doing controls during training.
+          # this will automatically generate the controls for you before training. The current script is not
+          # fully optimized so this could be rather slow for large datasets, but it caches them to disk so it
+          # only needs to be done once. If you want to skip this step, you can set the controls to [].
+          controls:
+            - "depth"
+            - "line"
+            - "pose"
+            - "inpaint"
+          # you can make custom inpainting images as well. These images must be webp or png format with an alpha channel.
+          # just erase the part of the image you want to inpaint and save it as a webp or png. Again, erase your
+          # training target (e.g. the person, if you are training a person). The automatic controls above with inpaint will
+          # just run a background remover mask and erase the foreground, which works well for subjects.
+          # inpaint_path: "/my/impaint/images"
+          # you can also specify existing control image pairs. It can handle multiple groups and will randomly
+          # select one for each step.
+          # control_path:
+          #   - "/my/custom/control/images"
+          #   - "/my/custom/control/images2"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          resolution: [ 512, 768, 1024 ]  # flex2 enjoys multiple resolutions
+      train:
+        batch_size: 1
+        # IMPORTANT! For Flex2, you must bypass the guidance embedder during training
+        bypass_guidance_embedding: true
+        steps: 3000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with flex2
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        # shift works well for training fast and learning composition and style.
+        # for just subject, you may want to change this to sigmoid
+        timestep_type: 'shift'  # 'linear', 'sigmoid', 'shift'
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        optimizer_params:
+          weight_decay: 1e-5
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+#        linear_timesteps: true
+        # ema will smooth out learning, but could slow it down. Defaults off
+        ema_config:
+          use_ema: false
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for flex, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "ostris/Flex.2-preview"
+        arch: "flex2"
+        quantize: true  # run 8bit mixed precision
+        quantize_te: true
+        # you can pass special training info for controls to the model here
+        # percentages are decimal based so 0.0 is 0% and 1.0 is 100% of the time.
+        model_kwargs:
+          # inverts the inpainting mask, good to learn outpainting as well, recommended 0.0 for characters
+          invert_inpaint_mask_chance: 0.5
+          # this will do a normal t2i training step without inpaint when dropped out. Recommended if you want
+          # your lora to be able to inference with and without inpainting.
+          inpaint_dropout: 0.5
+          # randomly drops out the control image. Dropout recommended if you want it to work without controls as well.
+          control_dropout: 0.5
+          # does a random inpaint blob. Usually a good idea to keep. Without it, the model will learn to always 100%
+          # fill the inpaint area with your subject. This is not always a good thing.
+          inpaint_random_chance: 0.5
+          # generates random inpaint blobs if you did not provide an inpaint image for your dataset. Inpaint breaks down fast
+          # if you are not training with it. Controls are a little more robust and can be left out,
+          # but when in doubt, always leave this on
+          do_random_inpainting: false
+          # does random blurring of the inpaint mask. Helps prevent weird edge artifacts for real world inpainting. Leave on.
+          random_blur_mask: true
+          # applies a small amount of random dilation and restriction to the inpaint mask. Helps with edge artifacts.
+          # Leave on.
+          random_dialate_mask: true
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+          # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          # you can use a single inpaint or single control image on your samples.
+          # for controls, the ctrl_idx is 1, the images can be any name and image format.
+          # use either a pose/line/depth image or whatever you are training with. An example is
+          # - "photo of [trigger] --ctrl_idx 1 --ctrl_img /path/to/control/image.jpg"
+          # for an inpainting image, it must be png/webp. Erase the part of the image you want to inpaint
+          # IMPORTANT! the inpaint images must be ctrl_idx 0 and have .inpaint.{ext} in the name for this to work right.
+          # - "photo of [trigger] --ctrl_idx 0 --ctrl_img /path/to/inpaint/image.inpaint.png"
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""  # not used on flex2
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_flex_24gb.yaml ADDED Viewed

	@@ -0,0 +1,101 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_flex_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#       hf_repo_id: your-username/your-model-slug
+#       hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # flex enjoys multiple resolutions
+      train:
+        batch_size: 1
+        # IMPORTANT! For Flex, you must bypass the guidance embedder during training
+        bypass_guidance_embedding: true
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with flex
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+#        linear_timesteps: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on.
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for flex, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "ostris/Flex.1-alpha"
+        is_flux: true
+        quantize: true  # run 8bit mixed precision
+        quantize_kwargs:
+          exclude:
+            - "*time_text_embed*"  # exclude the time text embedder from quantization
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""  # not used on flex
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_flux_24gb.yaml ADDED Viewed

	@@ -0,0 +1,96 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_flux_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#       hf_repo_id: your-username/your-model-slug
+#       hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # flux enjoys multiple resolutions
+      train:
+        batch_size: 1
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation_steps: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with flux
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+#        linear_timesteps: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on.
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for flux, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "black-forest-labs/FLUX.1-dev"
+        is_flux: true
+        quantize: true  # run 8bit mixed precision
+#        low_vram: true  # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""  # not used on flux
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 20
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_flux_kontext_24gb.yaml ADDED Viewed

	@@ -0,0 +1,106 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_flux_kontext_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#       hf_repo_id: your-username/your-model-slug
+#       hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          # control path is the input images for kontext for a paired dataset. These are the source images you want to change.
+          # You can comment this out and only use normal images if you don't have a paired dataset.
+          # Control images need to match the filenames on the folder path but in
+          # a different folder. These do not need captions.
+          control_path: "/path/to/control/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          # Kontext runs images in at 2x the latent size. It may OOM at 1024 resolution with 24GB vram.
+          resolution: [ 512, 768 ]  # flux enjoys multiple resolutions
+          # resolution: [ 512, 768, 1024 ]
+      train:
+        batch_size: 1
+        steps: 3000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation_steps: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with flux
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        timestep_type: "weighted" # sigmoid, linear, or weighted.
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # ema will smooth out learning, but could slow it down.
+        # ema_config:
+        #   use_ema: true
+        #   ema_decay: 0.99
+        # will probably need this if gpu supports it for flux, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path. This model is gated.
+        # visit https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev to accept the terms and conditions
+        # and then you can use this model.
+        name_or_path: "black-forest-labs/FLUX.1-Kontext-dev"
+        arch: "flux_kontext"
+        quantize: true  # run 8bit mixed precision
+#        low_vram: true  # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+          # the --ctrl_img path is the one loaded to apply the kontext editing to
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          - "make the person smile  --ctrl_img /path/to/control/folder/person1.jpg"
+          - "give the person an afro  --ctrl_img /path/to/control/folder/person1.jpg"
+          - "turn this image into a cartoon  --ctrl_img /path/to/control/folder/person1.jpg"
+          - "put this person in an action film  --ctrl_img /path/to/control/folder/person1.jpg"
+          - "make this person a rapper in a rap music video  --ctrl_img /path/to/control/folder/person1.jpg"
+          - "make the person smile  --ctrl_img /path/to/control/folder/person1.jpg"
+          - "give the person an afro  --ctrl_img /path/to/control/folder/person1.jpg"
+          - "turn this image into a cartoon  --ctrl_img /path/to/control/folder/person1.jpg"
+          - "put this person in an action film  --ctrl_img /path/to/control/folder/person1.jpg"
+          - "make this person a rapper in a rap music video  --ctrl_img /path/to/control/folder/person1.jpg"
+        neg: ""  # not used on flux
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 20
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_flux_schnell_24gb.yaml ADDED Viewed

	@@ -0,0 +1,98 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_flux_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#       hf_repo_id: your-username/your-model-slug
+#       hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # flux enjoys multiple resolutions
+      train:
+        batch_size: 1
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation_steps: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with flux
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+#        linear_timesteps: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on.
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for flux, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "black-forest-labs/FLUX.1-schnell"
+        assistant_lora_path: "ostris/FLUX.1-schnell-training-adapter" # Required for flux schnell training
+        is_flux: true
+        quantize: true  # run 8bit mixed precision
+        # low_vram is painfully slow to fuse in the adapter; avoid it unless absolutely necessary
+#        low_vram: true  # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""  # not used on flux
+        seed: 42
+        walk_seed: true
+        guidance_scale: 1  # schnell does not do guidance
+        sample_steps: 4  # 1 - 4 works well
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_hidream_48.yaml ADDED Viewed

	@@ -0,0 +1,112 @@

+# HiDream training is still highly experimental. The settings here will take ~35.2GB of vram to train.
+# It is not possible to train on a single 24GB card yet, but I am working on it. If you have more VRAM
+# I highly recommend first disabling quantization on the model itself if you can. You can leave the TEs quantized.
+# HiDream has a mixture of experts that may take special training considerations that I do not
+# have implemented properly. The current implementation seems to work well for LoRA training, but
+# may not be effective for longer training runs. The implementation could change in future updates
+# so your results may vary when this happens.
+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_hidream_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 32
+        linear_alpha: 32
+        network_kwargs:
+          # it is probably best to ignore the mixture of experts since only 2 are active in each block. Activating it works, but I wouldn't.
+          # proper training of it is not fully implemented
+          ignore_if_contains:
+            - "ff_i.experts"
+            - "ff_i.gate"
+      save:
+        dtype: bfloat16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermediate saves to keep
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          resolution: [ 512, 768, 1024 ]  # hidream enjoys multiple resolutions
+      train:
+        batch_size: 1
+        steps: 3000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation_steps: 1
+        train_unet: true
+        train_text_encoder: false  # won't work with hidream
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        timestep_type: shift # sigmoid, shift, linear
+        optimizer: "adamw8bit"
+        lr: 2e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+#        linear_timesteps: true
+        # ema will smooth out learning, but could slow it down. Defaults off
+        ema_config:
+          use_ema: false
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for hidream, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # the transformer will get grabbed from this hf repo
+        # warning ONLY train on Full. The dev and fast models are distilled and will break
+        name_or_path: "HiDream-ai/HiDream-I1-Full"
+        # the extras will be grabbed from this hf repo. (text encoder, vae)
+        extras_name_or_path: "HiDream-ai/HiDream-I1-Full"
+        arch: "hidream"
+        # both need to be quantized to train on 48GB currently
+        quantize: true
+        quantize_te: true
+        model_kwargs:
+          # llama is a gated model, so this defaults to the unsloth version, but you can set the llama path here
+          llama_model_path: "unsloth/Meta-Llama-3.1-8B-Instruct"
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_lumina.yaml ADDED Viewed

	@@ -0,0 +1,96 @@

+---
+# This configuration requires 20GB of VRAM or more to operate
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_lumina_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+      # performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+      # trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: bf16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 2 # how many intermediate saves to keep
+        save_format: 'diffusers' # 'diffusers'
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          # cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # lumina2 enjoys multiple resolutions
+      train:
+        batch_size: 1
+        # can be 'sigmoid', 'linear', or 'lumina2_shift'
+        timestep_type: 'lumina2_shift'
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with lumina2
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+        # skip_first_sample: true
+        # uncomment to completely disable sampling
+        # disable_sampling: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on if you have the vram
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for lumina2, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "Alpha-VLLM/Lumina-Image-2.0"
+        is_lumina2: true # lumina2 architecture
+        # you can quantize just the Gemma2 text encoder here to save vram
+        quantize_te: true
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+          # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a cat that is half black and half orange tabby, split down the middle. The cat has on a blue tophat. They are holding a martini glass with a pink ball of yarn in it with green knitting needles sticking out, in one paw. In the other paw, they are holding a DVD case for a movie titled, \"This is a test\" that has a golden robot on it. In the background is a busy night club with a giant mushroom man dancing with a bear."
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4.0
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_omnigen2_24gb.yaml ADDED Viewed

	@@ -0,0 +1,94 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_omnigen2_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermediate saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#        hf_repo_id: your-username/your-model-slug
+#        hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 512, 768, 1024 ]  # omnigen2 should work with multiple resolutions
+      train:
+        batch_size: 1
+        steps: 3000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with omnigen2
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        timestep_type: 'sigmoid' # sigmoid, linear, shift
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # ema will smooth out learning, but could slow it down.
+        # ema_config:
+        #   use_ema: true
+        #   ema_decay: 0.99
+        # will probably need this if gpu supports it for omnigen2, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        name_or_path: "OmniGen2/OmniGen2"
+        arch: "omnigen2"
+        quantize_te: true  # quantize only the text encoder
+        # quantize: true  # quantize transformer
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""  # negative prompt, optional
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_qwen_image_24gb.yaml ADDED Viewed

	@@ -0,0 +1,95 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_qwen_image_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+      # Trigger words will not work when caching text embeddings
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermediate saves to keep
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          # default_caption: "a person" # if caching text embeddings, if you don't have captions, this will get cached
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you have a large dataset
+          # if you OOM, 1024 may be too much, but should work
+          resolution: [ 512, 768, 1024 ]  # qwen image enjoys multiple resolutions
+      train:
+        batch_size: 1
+        # caching text embeddings is required for 24GB
+        cache_text_embeddings: true
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with qwen image
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "Qwen/Qwen-Image"
+        arch: "qwen_image"
+        quantize: true
+        # qtype_te: "qfloat8"  # default float8 quantization
+        # to use the ARA use the | pipe to point to hf path, or a local path if you have one.
+        # 3bit is required for 24GB
+        qtype: "uint3|ostris/accuracy_recovery_adapters/qwen_image_torchao_uint3.safetensors"
+        quantize_te: true
+        qtype_te: "qfloat8"
+        low_vram: true
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+          # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 3
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_qwen_image_edit_2509_32gb.yaml ADDED Viewed

	@@ -0,0 +1,105 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_qwen_image_edit_2509_lora_v1"
+  process:
+    - type: 'diffusion_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermediate saves to keep
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          # can do up to 3 control image folders, file names must match target file names, but aspect/size can be different
+          control_path:
+            - "/path/to/control/images/folder1"
+            - "/path/to/control/images/folder2"
+            - "/path/to/control/images/folder3"
+          caption_ext: "txt"
+          # default_caption: "a person" # if caching text embeddings, if you don't have captions, this will get cached
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          resolution: [ 512, 768, 1024 ]  # qwen image enjoys multiple resolutions
+          # a trigger word that can be cached with the text embeddings
+          # trigger_word: "optional trigger word"
+      train:
+        batch_size: 1
+        # caching text embeddings is required for 32GB
+        cache_text_embeddings: true
+        # unload_text_encoder: true
+        steps: 3000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        timestep_type: "weighted"
+        train_unet: true
+        train_text_encoder: false  # probably won't work with qwen image
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+        # skip_first_sample: true
+        # uncomment to completely disable sampling
+        # disable_sampling: true
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "Qwen/Qwen-Image-Edit-2509"
+        arch: "qwen_image_edit_plus"
+        quantize: true
+        # to use the ARA use the | pipe to point to hf path, or a local path if you have one.
+        # 3bit is required for 32GB
+        qtype: "uint3|ostris/accuracy_recovery_adapters/qwen_image_edit_2509_torchao_uint3.safetensors"
+        quantize_te: true
+        qtype_te: "qfloat8"
+        low_vram: true
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        # you can provide up to 3 control images here
+        samples:
+          - prompt: "Do whatever with Image1 and Image2"
+            ctrl_img_1: "/path/to/image1.png"
+            ctrl_img_2: "/path/to/image2.png"
+            # ctrl_img_3: "/path/to/image3.png"
+          - prompt: "Do whatever with Image1 and Image2"
+            ctrl_img_1: "/path/to/image1.png"
+            ctrl_img_2: "/path/to/image2.png"
+            # ctrl_img_3: "/path/to/image3.png"
+          - prompt: "Do whatever with Image1 and Image2"
+            ctrl_img_1: "/path/to/image1.png"
+            ctrl_img_2: "/path/to/image2.png"
+            # ctrl_img_3: "/path/to/image3.png"
+          - prompt: "Do whatever with Image1 and Image2"
+            ctrl_img_1: "/path/to/image1.png"
+            ctrl_img_2: "/path/to/image2.png"
+            # ctrl_img_3: "/path/to/image3.png"
+          - prompt: "Do whatever with Image1 and Image2"
+            ctrl_img_1: "/path/to/image1.png"
+            ctrl_img_2: "/path/to/image2.png"
+            # ctrl_img_3: "/path/to/image3.png"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 3
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_qwen_image_edit_32gb.yaml ADDED Viewed

	@@ -0,0 +1,102 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_qwen_image_edit_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+      # Trigger words will not work when caching text embeddings
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermediate saves to keep
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          control_path: "/path/to/control/images/folder"
+          caption_ext: "txt"
+          # default_caption: "a person" # if caching text embeddings, if you don't have captions, this will get cached
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          resolution: [ 512, 768, 1024 ]  # qwen image enjoys multiple resolutions
+      train:
+        batch_size: 1
+        # caching text embeddings is required for 32GB
+        cache_text_embeddings: true
+        steps: 3000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        timestep_type: "weighted"
+        train_unet: true
+        train_text_encoder: false  # probably won't work with qwen image
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "Qwen/Qwen-Image-Edit"
+        arch: "qwen_image_edit"
+        quantize: true
+        # qtype_te: "qfloat8"  # default float8 quantization
+        # to use the ARA use the | pipe to point to hf path, or a local path if you have one.
+        # 3bit is required for 32GB
+        qtype: "uint3|qwen_image_edit_torchao_uint3.safetensors"
+        quantize_te: true
+        qtype_te: "qfloat8"
+        low_vram: true
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        samples:
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+          - prompt: "do the thing to it"
+            ctrl_img: "/path/to/control/image.jpg"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 3
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_sd35_large_24gb.yaml ADDED Viewed

	@@ -0,0 +1,97 @@

+---
+# NOTE!! THIS IS CURRENTLY EXPERIMENTAL AND UNDER DEVELOPMENT. SOME THINGS WILL CHANGE
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_sd3l_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 16
+        linear_alpha: 16
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermediate saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#        hf_repo_id: your-username/your-model-slug
+#        hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 1024 ]
+      train:
+        batch_size: 1
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation_steps: 1
+        train_unet: true
+        train_text_encoder: false  # May not fully work with SD3 yet
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch"
+        timestep_type: "linear" # linear or sigmoid
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # uncomment to use new bell curved weighting. Experimental but may produce better results
+#        linear_timesteps: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on.
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        # will probably need this if gpu supports it for sd3, other dtypes may not work correctly
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "stabilityai/stable-diffusion-3.5-large"
+        is_v3: true
+        quantize: true  # run 8bit mixed precision
+      sample:
+        sampler: "flowmatch" # must match train.noise_scheduler
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 4
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_wan21_14b_24gb.yaml ADDED Viewed

	@@ -0,0 +1,101 @@

+# IMPORTANT: The Wan2.1 14B model is huge. This config should work on 24GB GPUs. It cannot
+# support keeping the text encoder on GPU while training with 24GB, so it is only good
+# for training on a single prompt, for example a person with a trigger word.
+# to train on captions, you need more vram for now.
+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_wan21_14b_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+      # this is probably needed for 24GB cards when offloading TE to CPU
+      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 32
+        linear_alpha: 32
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#       hf_repo_id: your-username/your-model-slug
+#       hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        # AI-Toolkit does not currently support video datasets, we will train on 1 frame at a time
+        # it works well for characters, but not as well for "actions"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 632 ]  # will be around 480p
+      train:
+        batch_size: 1
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with wan
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        timestep_type: 'sigmoid'
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        optimizer_params:
+          weight_decay: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on.
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        dtype: bf16
+        # required for 24GB cards
+        # this will encode your trigger word and use those embeddings for every image in the dataset
+        unload_text_encoder: true
+      model:
+        # huggingface model name or path
+        name_or_path: "Wan-AI/Wan2.1-T2V-14B-Diffusers"
+        arch: 'wan21'
+        # these settings will save as much vram as possible
+        quantize: true
+        quantize_te: true
+        low_vram: true
+      sample:
+        sampler: "flowmatch"
+        sample_every: 250 # sample every this many steps
+        width: 832
+        height: 480
+        num_frames: 40
+        fps: 15
+        # samples take a long time. so use them sparingly
+        # samples will be animated webp files, if you don't see them animated, open in a browser.
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 5
+        sample_steps: 30
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_wan21_1b_24gb.yaml ADDED Viewed

	@@ -0,0 +1,90 @@

+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_wan21_1b_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # if a trigger word is specified, it will be added to captions of training data if it does not already exist
+      # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
+#      trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 32
+        linear_alpha: 32
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+        push_to_hub: false #change this to True to push your trained model to Hugging Face.
+        # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
+#       hf_repo_id: your-username/your-model-slug
+#       hf_private: true #whether the repo is private or public
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
+        # images will automatically be resized and bucketed into the resolution specified
+        # on windows, escape back slashes with another backslash so
+        # "C:\\path\\to\\images\\folder"
+        # AI-Toolkit does not currently support video datasets, we will train on 1 frame at a time
+        # it works well for characters, but not as well for "actions"
+        - folder_path: "/path/to/images/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          shuffle_tokens: false  # shuffle caption order, split by commas
+          cache_latents_to_disk: true  # leave this true unless you know what you're doing
+          resolution: [ 632 ]  # will be around 480p
+      train:
+        batch_size: 1
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with wan
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        timestep_type: 'sigmoid'
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        optimizer_params:
+          weight_decay: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        # ema will smooth out learning, but could slow it down. Recommended to leave on.
+        ema_config:
+          use_ema: true
+          ema_decay: 0.99
+        dtype: bf16
+      model:
+        # huggingface model name or path
+        name_or_path: "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
+        arch: 'wan21'
+        quantize_te: true # saves vram
+      sample:
+        sampler: "flowmatch"
+        sample_every: 250 # sample every this many steps
+        width: 832
+        height: 480
+        num_frames: 40
+        fps: 15
+        # samples take a long time. so use them sparingly
+        # samples will be animated webp files, if you don't see them animated, open in a browser.
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 5
+        sample_steps: 30
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_lora_wan22_14b_24gb.yaml ADDED Viewed

	@@ -0,0 +1,111 @@

+# this example focuses mainly on training Wan2.2 14b on images. It will work for video as well by increasing
+# the number of frames in the dataset and samples. Training on and generating video is very VRAM intensive.
+---
+job: extension
+config:
+  # this name will be the folder and filename name
+  name: "my_first_wan22_14b_lora_v1"
+  process:
+    - type: 'sd_trainer'
+      # root folder to save training sessions/samples/weights
+      training_folder: "output"
+      # uncomment to see performance stats in the terminal every N steps
+#      performance_log_every: 1000
+      device: cuda:0
+      # Use a trigger word if train.unload_text_encoder is true, however, if caching text embeddings, do not use a trigger word
+      # trigger_word: "p3r5on"
+      network:
+        type: "lora"
+        linear: 32
+        linear_alpha: 32
+      save:
+        dtype: float16 # precision to save
+        save_every: 250 # save every this many steps
+        max_step_saves_to_keep: 4 # how many intermittent saves to keep
+      datasets:
+        # datasets are a folder of images. captions need to be txt files with the same name as the image
+        # for instance image2.jpg and image2.txt.
+        # "C:\\path\\to\\images\\folder"
+        - folder_path: "/path/to/images/or/video/folder"
+          caption_ext: "txt"
+          caption_dropout_rate: 0.05  # will drop out the caption 5% of time
+          # number of frames to extract from your video. It will automatically extract them evenly spaced
+          # set to 1 frame for images
+          num_frames: 1
+          resolution: [ 512, 768, 1024]
+      train:
+        batch_size: 1
+        steps: 2000  # total number of steps to train 500 - 4000 is a good range
+        gradient_accumulation: 1
+        train_unet: true
+        train_text_encoder: false  # probably won't work with wan
+        gradient_checkpointing: true  # need this on unless you have a ton of vram
+        noise_scheduler: "flowmatch" # for training only
+        timestep_type: 'linear'
+        optimizer: "adamw8bit"
+        lr: 1e-4
+        optimizer_params:
+          weight_decay: 1e-4
+        # uncomment this to skip the pre training sample
+#        skip_first_sample: true
+        # uncomment to completely disable sampling
+#        disable_sampling: true
+        dtype: bf16
+        # IMPORTANT: this is for Wan 2.2 MOE. It will switch training one stage or the other every this many steps
+        switch_boundary_every: 10
+        # required for 24GB cards. You must do either unload_text_encoder or cache_text_embeddings but not both
+        # this will encode your trigger word and use those embeddings for every image in the dataset, captions will be ignored
+        # unload_text_encoder: true
+        # this will cache all captions in your dataset.
+        cache_text_embeddings: true
+      model:
+        # huggingface model name or path, this one is bf16, vs the float32 of the official repo
+        name_or_path: "ai-toolkit/Wan2.2-T2V-A14B-Diffusers-bf16"
+        arch: 'wan22_14b'
+        quantize: true
+        # This will pull and use a custom Accuracy Recovery Adapter to train at 4bit
+        qtype: "uint4|ostris/accuracy_recovery_adapters/wan22_14b_t2i_torchao_uint4.safetensors"
+        quantize_te: true
+        qtype_te: "qfloat8"
+        low_vram: true
+        model_kwargs:
+          # you can train high noise, low noise, or both. With low vram it will automatically unload the one not being trained.
+          train_high_noise: true
+          train_low_noise: true
+      sample:
+        sampler: "flowmatch"
+        sample_every: 250 # sample every this many steps
+        width: 1024
+        height: 1024
+        # set to 1 for images
+        num_frames: 1
+        fps: 16
+        # samples take a long time. so use them sparingly
+        # samples will be animated webp files, if you don't see them animated, open in a browser.
+        prompts:
+          # you can add [trigger] to the prompts here and it will be replaced with the trigger word
+#          - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
+          - "woman with red hair, playing chess at the park, bomb going off in the background"
+          - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
+          - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
+          - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
+          - "a bear building a log cabin in the snow covered mountains"
+          - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
+          - "hipster man with a beard, building a chair, in a wood shop"
+          - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
+          - "a man holding a sign that says, 'this is a sign'"
+          - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
+        neg: ""
+        seed: 42
+        walk_seed: true
+        guidance_scale: 3.5
+        sample_steps: 25
+# you can add any additional meta info here. [name] is replaced with config name at top
+meta:
+  name: "[name]"
+  version: '1.0'

ai-toolkit/config/examples/train_slider.example.yml ADDED Viewed

	@@ -0,0 +1,230 @@

+---
+# This is in yaml format. You can use json if you prefer
+# I like both but yaml is easier to write
+# Plus it has comments which is nice for documentation
+# This is the config I use on my sliders, It is solid and tested
+job: train
+config:
+  # the name will be used to create a folder in the output folder
+  # it will also replace any [name] token in the rest of this config
+  name: detail_slider_v1
+  # folder will be created with name above in folder below
+  # it can be relative to the project root or absolute
+  training_folder: "output/LoRA"
+  device: cuda:0 # cpu, cuda:0, etc
+  # for tensorboard logging, we will make a subfolder for this job
+  log_dir: "output/.tensorboard"
+  # you can stack processes for other jobs, It is not tested with sliders though
+  # just use one for now
+  process:
+    - type: slider # tells runner to run the slider process
+      # network is the LoRA network for a slider, I recommend to leave this be
+      network:
+        # network type lierla is traditional LoRA that works everywhere, only linear layers
+        type: "lierla"
+        # rank / dim of the network. Bigger is not always better. Especially for sliders. 8 is good
+        linear: 8
+        linear_alpha: 4 # Do about half of rank
+      # training config
+      train:
+        # this is also used in sampling. Stick with ddpm unless you know what you are doing
+        noise_scheduler: "ddpm" # options: "ddpm", "lms", "euler_a"
+        # how many steps to train. More is not always better. I rarely go over 1000
+        steps: 500
+        # I have had good results with 4e-4 to 1e-4 at 500 steps
+        lr: 2e-4
+        # enables gradient checkpoint, saves vram, leave it on
+        gradient_checkpointing: true
+        # train the unet. I recommend leaving this true
+        train_unet: true
+        # train the text encoder. I don't recommend this unless you have a special use case
+        # for sliders we are adjusting representation of the concept (unet),
+        # not the description of it (text encoder)
+        train_text_encoder: false
+        # same as from sd-scripts, not fully tested but should speed up training
+        min_snr_gamma: 5.0
+        # just leave unless you know what you are doing
+        # also supports "dadaptation" but set lr to 1 if you use that,
+        # but it learns too fast and I don't recommend it
+        optimizer: "adamw"
+        # only constant for now
+        lr_scheduler: "constant"
+        # we randomly denoise a random number of steps from 1 to this number
+        # while training. Just leave it
+        max_denoising_steps: 40
+        # works great at 1. I do 1 even with my 4090.
+        # higher may not work right with newer single batch stacking code anyway
+        batch_size: 1
+        # bf16 works best if your GPU supports it (modern)
+        dtype: bf16  # fp32, bf16, fp16
+        # if you have it, use it. It is faster and better
+        # torch 2.0 doesnt need xformers anymore, only use if you have lower version
+#        xformers: true
+        # I don't recommend using unless you are trying to make a darker lora. Then do 0.1 MAX
+        # although, the way we train sliders is comparative, so it probably won't work anyway
+        noise_offset: 0.0
+#        noise_offset: 0.0357  # SDXL was trained with offset of 0.0357. So use that when training on SDXL
+      # the model to train the LoRA network on
+      model:
+        # huggingface name, path relative to the project root, or absolute path to .safetensors or .ckpt
+        name_or_path: "runwayml/stable-diffusion-v1-5"
+        is_v2: false  # for v2 models
+        is_v_pred: false # for v-prediction models (most v2 models)
+        # has some issues with the dual text encoder and the way we train sliders
+        # it works, but the weights probably need to be higher to see it.
+        is_xl: false  # for SDXL models
+      # saving config
+      save:
+        dtype: float16 # precision to save. I recommend float16
+        save_every: 50 # save every this many steps
+        # this will remove step counts more than this number
+        # allows you to save more often in case of a crash without filling up your drive
+        max_step_saves_to_keep: 2
+      # sampling config
+      sample:
+        # must match train.noise_scheduler, this is not used here
+        # but may be in future and in other processes
+        sampler: "ddpm"
+        # sample every this many steps
+        sample_every: 20
+        # image size
+        width: 512
+        height: 512
+        # prompts to use for sampling. Do as many as you want, but it slows down training
+        # pick ones that will best represent the concept you are trying to adjust
+        # allows some flags after the prompt
+        #  --m [number]  # network multiplier. LoRA weight. -3 for the negative slide, 3 for the positive
+        #      slide are good tests. will inherit sample.network_multiplier if not set
+        #  --n [string]  # negative prompt, will inherit sample.neg if not set
+        # Only 75 tokens allowed currently
+        # I like to do a wide positive and negative spread so I can see a good range and stop
+        # early if the network is breaking down
+        prompts:
+          - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -5"
+          - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -3"
+          - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 3"
+          - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 5"
+          - "a golden retriever sitting on a leather couch, --m -5"
+          - "a golden retriever sitting on a leather couch --m -3"
+          - "a golden retriever sitting on a leather couch --m 3"
+          - "a golden retriever sitting on a leather couch --m 5"
+          - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -5"
+          - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -3"
+          - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 3"
+          - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 5"
+        # negative prompt used on all prompts above as default if they don't have one
+        neg: "cartoon, fake, drawing, illustration, cgi, animated, anime, monochrome"
+        # seed for sampling. 42 is the answer for everything
+        seed: 42
+        # walks the seed so s1 is 42, s2 is 43, s3 is 44, etc
+        # will start over on next sample_every so s1 is always seed
+        # works well if you use same prompt but want different results
+        walk_seed: false
+        # cfg scale (4 to 10 is good)
+        guidance_scale: 7
+        # sampler steps (20 to 30 is good)
+        sample_steps: 20
+        # default network multiplier for all prompts
+        # since we are training a slider, I recommend overriding this with --m [number]
+        # in the prompts above to get both sides of the slider
+        network_multiplier: 1.0
+      # logging information
+      logging:
+        log_every: 10 # log every this many steps
+        use_wandb: false # not supported yet
+        verbose: false # probably don't need unless you are debugging
+      # slider training config, best for last
+      slider:
+        # resolutions to train on. [ width, height ]. This is less important for sliders
+        # as we are not teaching the model anything it doesn't already know
+        # but must be a size it understands [ 512, 512 ] for sd_v1.5  and [ 768, 768 ] for sd_v2.1
+        # and [ 1024, 1024 ] for sd_xl
+        # you can do as many as you want here
+        resolutions:
+          - [ 512, 512 ]
+#          - [ 512, 768 ]
+#          - [ 768, 768 ]
+        # slider training uses 4 combined steps for a single round. This will do it in one gradient
+        # step. It is highly optimized and shouldn't take anymore vram than doing without it,
+        # since we break down batches for gradient accumulation now. so just leave it on.
+        batch_full_slide: true
+        # These are the concepts to train on. You can do as many as you want here,
+        # but they can conflict with or outweigh each other. Other than experimenting, I recommend
+        # just doing one for good results
+        targets:
+            # target_class is the base concept we are adjusting the representation of
+            # for example, if we are adjusting the representation of a person, we would use "person"
+            # if we are adjusting the representation of a cat, we would use "cat". It is not
+            # a keyword necessarily but what the model understands the concept to represent.
+            # "person" will affect men, women, children, etc but will not affect cats, dogs, etc
+            # it is the models base general understanding of the concept and everything it represents
+            # you can leave it blank to affect everything. In this example, we are adjusting
+            # detail, so we will leave it blank to affect everything
+          - target_class: ""
+            # positive is the prompt for the positive side of the slider.
+            # It is the concept that will be excited and amplified in the model when we slide the slider
+            # to the positive side and forgotten / inverted when we slide
+            # the slider to the negative side. It is generally best to include the target_class in
+            # the prompt. You want it to be the extreme of what you want to train on. For example,
+            # if you want to train on fat people, you would use "an extremely fat, morbidly obese person"
+            # as the prompt. Not just "fat person"
+            # max 75 tokens for now
+            positive: "high detail, 8k, intricate, detailed, high resolution, high res, high quality"
+            # negative is the prompt for the negative side of the slider and works the same as positive
+            # it does not necessarily work the same as a negative prompt when generating images
+            # these need to be polar opposites.
+            # max 75 tokens for now
+            negative: "blurry, boring, fuzzy, low detail, low resolution, low res, low quality"
+            # the loss for this target is multiplied by this number.
+            # if you are doing more than one target it may be good to set less important ones
+            # to a lower number like 0.1 so they don't outweigh the primary target
+            weight: 1.0
+            # shuffle the prompts split by the comma. We will run every combination randomly
+            # this will make the LoRA more robust. You probably want this on unless prompt order
+            # is important for some reason
+            shuffle: true
+        # anchors are prompts that we will try to hold on to while training the slider
+        # these are NOT necessary and can prevent the slider from converging if not done right
+        # leave them off if you are having issues, but they can help lock the network
+        # on certain concepts to help prevent catastrophic forgetting
+        # you want these to generate an image that is not your target_class, but close to it
+        # is fine as long as it does not directly overlap it.
+        # For example, if you are training on a person smiling,
+        # you could use "a person with a face mask" as an anchor. It is a person, the image is the same
+        # regardless if they are smiling or not, however, the closer the concept is to the target_class
+        # the less the multiplier needs to be. Keep multipliers less than 1.0 for anchors usually
+        # for close concepts, you want to be closer to 0.1 or 0.2
+        # these will slow down training. I am leaving them off for the demo
+#        anchors:
+#          - prompt: "a woman"
+#            neg_prompt: "animal"
+#            # the multiplier applied to the LoRA when this is run.
+#            # higher will give it more weight but also help keep the lora from collapsing
+#            multiplier: 1.0
+#          - prompt: "a man"
+#            neg_prompt: "animal"
+#            multiplier: 1.0
+#          - prompt: "a person"
+#            neg_prompt: "animal"
+#            multiplier: 1.0
+# You can put any information you want here, and it will be saved in the model.
+# The below is an example, but you can put your grocery list in it if you want.
+# It is saved in the model so be aware of that. The software will include this
+# plus some other information for you automatically
+meta:
+  # [name] gets replaced with the name above
+  name: "[name]"
+#  version: '1.0'
+#  creator:
+#    name: Your Name
+#    email: your@gmail.com
+#    website: https://your.website

ai-toolkit/dgx_instructions.md ADDED Viewed

	@@ -0,0 +1,84 @@

+# AI Toolkit by Ostris
+## DGX OS installation instructions
+You need to use Python 3.11 to run AI Toolkit on DGX OS. The easiest way to do this without affecting the system installation of Python is to create a virtual environment with **miniconda**, which allows you to specify the version of Python to use in the environment.
+This guide will assume you have a fresh installation of DGX OS, and will guide you through the installation of all requirements.
+### Installation instructions for DGX OS:
+**1) Get Python 3.11 (via miniconda)**
+Install the latest version of miniconda:
+```
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-aarch64.sh
+chmod u+x Miniconda3-latest-Linux-aarch64.sh
+./Miniconda3-latest-Linux-aarch64.sh
+```
+Restart your bash or ssh session. If miniconda was installed successfully, it will automatically load the 'base' environment by default. If you want to disable this behaviour, run:
+```
+conda config --set auto_activate_base false
+```
+Now you can create a Python 3.11 environment for ai-toolkit:
+```
+conda create --name ai-toolkit python=3.11
+```
+Then activate the environment with:
+```
+conda activate ai-toolkit
+```
+**2) Install PyTorch**
+```
+pip3 install torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu130
+```
+**3) Install the remaining requirements (dgx_requirements.txt)**
+```
+pip3 install -r dgx_requirements.txt
+```
+### Running the UI on DGX OS:
+Running the UI is much the same as on other systems; however, you need to install the ARM64 build of Node.js for Linux, which is compatible with the NVIDIA Grace CPU.
+**1) Install Node.js**
+Download a Linux ARM64 build of Node.js from: https://nodejs.org (for example: https://nodejs.org/dist/v24.11.1/node-v24.11.1-linux-arm64.tar.xz)
+Extract it and add the bin directory to your path. I extracted it to **/opt** and added the following to my ~/.bashrc file:
+```
+export PATH="/opt/node-v24.11.1-linux-arm64/bin:$PATH"
+```
+**2) Compile and run the Node.js UI**
+Change to the ui directory, then build and run the UI:
+```
+cd ui
+npm run build_and_start
+```
+If all went well, you’ll be able to access the UI on port 8675 and start training.
+<details>
+  <summary>Troubleshooting issues</summary>
+If you’re not getting any output when starting a training job from the UI, the job is probably crashing before the training process starts. The best way to debug this is to run the Python training script directly (it is normally started by the UI). To do this, set up a training job in the UI, go to the advanced config screen, copy and paste the configuration into a file such as train.yaml, then run the training script with the conda virtual environment active:
+```
+python run.py path/to/train.yaml
+```
+</details>
+<br>

ai-toolkit/dgx_requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+# You need to use Python 3.11. The easiest way to get this on DGX OS without impacting the system version of Python is to create an environment with miniconda.
+# specific dependency versions needed on DGX OS devices:
+scipy==1.16.0
+tifffile==2025.6.11
+imageio==2.37.0
+scikit_image==0.25.2
+clean_fid==0.1.35
+pywavelets==1.9.0
+contourpy==1.3.3
+opencv_python_headless==4.11.0.86
+-r requirements_base.txt

ai-toolkit/docker-compose.yml ADDED Viewed

	@@ -0,0 +1,25 @@

+# Runs the ostris/aitoolkit image with GPU access and publishes the UI on port 8675.
+version: "3.8"
+services:
+  ai-toolkit:
+    image: ostris/aitoolkit:latest
+    restart: unless-stopped
+    ports:
+      # host:container
+      - "8675:8675"
+    volumes:
+      # share the host's Hugging Face hub cache with the container
+      - ~/.cache/huggingface/hub:/root/.cache/huggingface/hub
+      # NOTE(review): single-file bind mount; presumably ./aitk_db.db must already exist on the
+      # host as a file, otherwise Docker may create a directory at that path -- confirm
+      - ./aitk_db.db:/app/ai-toolkit/aitk_db.db
+      - ./datasets:/app/ai-toolkit/datasets
+      - ./output:/app/ai-toolkit/output
+      - ./config:/app/ai-toolkit/config
+    environment:
+      # taken from the host environment; falls back to the literal "password" when unset or
+      # empty, so set AI_TOOLKIT_AUTH yourself before exposing the port
+      - AI_TOOLKIT_AUTH=${AI_TOOLKIT_AUTH:-password}
+      - NODE_ENV=production
+      - TZ=UTC
+    deploy:
+      resources:
+        reservations:
+          devices:
+            # reserve every NVIDIA GPU on the host for this service
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]

ai-toolkit/docker/Dockerfile ADDED Viewed

	@@ -0,0 +1,108 @@

+FROM nvidia/cuda:12.8.1-devel-ubuntu24.04
+LABEL authors="jaret"
+# Set noninteractive to avoid timezone prompts
+ENV DEBIAN_FRONTEND=noninteractive
+# ref https://en.wikipedia.org/wiki/CUDA
+ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0 10.0 12.0"
+# Install dependencies
+RUN apt-get update && apt-get install --no-install-recommends -y \
+    git \
+    curl \
+    build-essential \
+    cmake \
+    wget \
+    python3.12 \
+    python3-pip \
+    python3-dev \
+    python3-setuptools \
+    python3-wheel \
+    python3-venv \
+    ffmpeg \
+    tmux \
+    htop \
+    nvtop \
+    python3-opencv \
+    openssh-client \
+    openssh-server \
+    openssl \
+    rsync \
+    unzip \
+    && apt-get clean \
+    && rm -rf /var/lib/apt/lists/*
+# Install nodejs
+WORKDIR /tmp
+RUN curl -sL https://deb.nodesource.com/setup_23.x -o nodesource_setup.sh && \
+    bash nodesource_setup.sh && \
+    apt-get update && \
+    apt-get install -y nodejs && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+WORKDIR /app
+# Set aliases for python and pip
+RUN ln -s /usr/bin/python3 /usr/bin/python
+# install pytorch before cache bust to avoid redownloading pytorch
+RUN pip install --no-cache-dir torch==2.9.1 torchvision==0.24.1 torchaudio==2.9.1 --index-url https://download.pytorch.org/whl/cu128 --break-system-packages
+WORKDIR /app/ai-toolkit
+# ---------------------------------------------------------------------------- #
+# Dependency layers come BEFORE the source clone so they are only rebuilt (and
+# only need to be re-pulled by servers) when the dependency manifests change,
+# not on every code change.
+# ---------------------------------------------------------------------------- #
+# Install Python dependencies (only re-runs when the requirements files change)
+COPY requirements.txt requirements_base.txt /app/ai-toolkit/
+RUN pip install --no-cache-dir --break-system-packages -r requirements.txt && \
+    pip install setuptools==69.5.1 --no-cache-dir --break-system-packages
+# Install Node dependencies (only re-runs when package.json / package-lock.json change)
+COPY ui/package.json ui/package-lock.json /app/ai-toolkit/ui/
+RUN cd /app/ai-toolkit/ui && npm ci
+# ---------------------------------------------------------------------------- #
+# Source code comes LAST. Only this layer (plus the UI build below) is rebuilt
+# on a code change, so servers only re-pull the (small) source, not the deps.
+# Clone to a temp dir and rsync the source in, preserving the dependency dirs
+# already populated above (ui/node_modules) and the manifests already used.
+# ---------------------------------------------------------------------------- #
+ARG CACHEBUST=1234
+ARG GIT_COMMIT=main
+RUN echo "Cache bust: ${CACHEBUST}" && \
+    git clone https://github.com/ostris/ai-toolkit.git /tmp/ai-toolkit-src && \
+    cd /tmp/ai-toolkit-src && \
+    git checkout ${GIT_COMMIT} && \
+    rsync -a --delete \
+        --exclude 'ui/node_modules' \
+        --exclude 'requirements.txt' \
+        --exclude 'ui/package.json' \
+        --exclude 'ui/package-lock.json' \
+        /tmp/ai-toolkit-src/ /app/ai-toolkit/ && \
+    rm -rf /tmp/ai-toolkit-src
+# Build UI (re-runs on code change, but reuses the cached node_modules above).
+# update_db runs first because it does `prisma generate`, which creates the
+# @prisma/client types the TS build needs. In the old layout generate happened
+# as a side effect of npm install seeing the schema; now the source arrives
+# after npm ci, so run it explicitly before the build.
+RUN cd /app/ai-toolkit/ui && \
+    npm run update_db && \
+    npm run build
+# Expose the port the UI listens on (8675)
+EXPOSE 8675
+WORKDIR /
+COPY docker/start.sh /start.sh
+RUN chmod +x /start.sh
+CMD ["/start.sh"]

ai-toolkit/docker/start.sh ADDED Viewed

	@@ -0,0 +1,70 @@

+#!/bin/bash
+set -e  # Exit the script if any statement returns a non-true return value
+# ref https://github.com/runpod/containers/blob/main/container-template/start.sh
+# ---------------------------------------------------------------------------- #
+#                          Function Definitions                                #
+# ---------------------------------------------------------------------------- #
+# Setup ssh
+setup_ssh() {
+    # Configure and start sshd when a public key is supplied through $PUBLIC_KEY.
+    # Does nothing (and returns 0) when PUBLIC_KEY is unset or empty.
+    if [[ -z "$PUBLIC_KEY" ]]; then
+        return 0
+    fi
+    local key_type key_file key
+    echo "Setting up SSH..."
+    mkdir -p ~/.ssh
+    echo "$PUBLIC_KEY" >> ~/.ssh/authorized_keys
+    chmod 700 -R ~/.ssh
+    # Generate whichever host keys are missing, one per key type.
+    for key_type in rsa dsa ecdsa ed25519; do
+        key_file="/etc/ssh/ssh_host_${key_type}_key"
+        if [ -f "$key_file" ]; then
+            continue
+        fi
+        if ! ssh-keygen -t "$key_type" -f "$key_file" -q -N ''; then
+            if [ "$key_type" = "dsa" ]; then
+                # Newer OpenSSH builds no longer support DSA. Because the script runs under
+                # `set -e`, an unguarded failure here would stop the UI from ever starting.
+                echo "DSA host keys are not supported by this OpenSSH build, skipping."
+                continue
+            fi
+            echo "Failed to generate ${key_type^^} host key" >&2
+            return 1
+        fi
+        echo "${key_type^^} key fingerprint:"
+        ssh-keygen -lf "${key_file}.pub"
+    done
+    service ssh start
+    echo "SSH host keys:"
+    for key in /etc/ssh/*.pub; do
+        echo "Key: $key"
+        # quoted so a path containing whitespace is passed as a single argument
+        ssh-keygen -lf "$key"
+    done
+}
+# Export env vars
+export_env_vars() {
+    echo "Exporting environment variables..."
+    printenv | grep -E '^RUNPOD_|^PATH=|^_=' | awk -F = '{ print "export " $1 "=\"" $2 "\"" }' >> /etc/rp_environment
+    echo 'source /etc/rp_environment' >> ~/.bashrc
+}
+# ---------------------------------------------------------------------------- #
+#                               Main Program                                   #
+# ---------------------------------------------------------------------------- #
+echo "Pod Started"
+setup_ssh
+export_env_vars
+echo "Starting AI Toolkit UI..."
+cd /app/ai-toolkit/ui && npm run start

ai-toolkit/extensions/example/ExampleMergeModels.py ADDED Viewed

	@@ -0,0 +1,129 @@

+import torch
+import gc
+from collections import OrderedDict
+from typing import TYPE_CHECKING
+from jobs.process import BaseExtensionProcess
+from toolkit.config_modules import ModelConfig
+from toolkit.stable_diffusion_model import StableDiffusion
+from toolkit.train_tools import get_torch_dtype
+from tqdm import tqdm
+# Type check imports. Prevents circular imports
+if TYPE_CHECKING:
+    from jobs import ExtensionJob
+# extend standard config classes to add weight
+class ModelInputConfig(ModelConfig):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        self.weight = kwargs.get('weight', 1.0)
+        # overwrite default dtype unless user specifies otherwise
+        # float 32 will give up better precision on the merging functions
+        self.dtype: str = kwargs.get('dtype', 'float32')
+def flush():
+    torch.cuda.empty_cache()
+    gc.collect()
+# this is our main class process
+class ExampleMergeModels(BaseExtensionProcess):
+    def __init__(
+            self,
+            process_id: int,
+            job: 'ExtensionJob',
+            config: OrderedDict
+    ):
+        super().__init__(process_id, job, config)
+        # this is the setup process, do not do process intensive stuff here, just variable setup and
+        # checking requirements. This is called before the run() function
+        # no loading models or anything like that, it is just for setting up the process
+        # all of your process intensive stuff should be done in the run() function
+        # config will have everything from the process item in the config file
+        # convince methods exist on BaseProcess to get config values
+        # if required is set to true and the value is not found it will throw an error
+        # you can pass a default value to get_conf() as well if it was not in the config file
+        # as well as a type to cast the value to
+        self.save_path = self.get_conf('save_path', required=True)
+        self.save_dtype = self.get_conf('save_dtype', default='float16', as_type=get_torch_dtype)
+        self.device = self.get_conf('device', default='cpu', as_type=torch.device)
+        # build models to merge list
+        models_to_merge = self.get_conf('models_to_merge', required=True, as_type=list)
+        # build list of ModelInputConfig objects. I find it is a good idea to make a class for each config
+        # this way you can add methods to it and it is easier to read and code. There are a lot of
+        # inbuilt config classes located in toolkit.config_modules as well
+        self.models_to_merge = [ModelInputConfig(**model) for model in models_to_merge]
+        # setup is complete. Don't load anything else here, just setup variables and stuff
+    # this is the entire run process be sure to call super().run() first
+    def run(self):
+        # always call first
+        super().run()
+        print(f"Running process: {self.__class__.__name__}")
+        # let's adjust our weights first to normalize them so the total is 1.0
+        total_weight = sum([model.weight for model in self.models_to_merge])
+        weight_adjust = 1.0 / total_weight
+        for model in self.models_to_merge:
+            model.weight *= weight_adjust
+        output_model: StableDiffusion = None
+        # let's do the merge, it is a good idea to use tqdm to show progress
+        for model_config in tqdm(self.models_to_merge, desc="Merging models"):
+            # setup model class with our helper class
+            sd_model = StableDiffusion(
+                device=self.device,
+                model_config=model_config,
+                dtype="float32"
+            )
+            # load the model
+            sd_model.load_model()
+            # adjust the weight of the text encoder
+            if isinstance(sd_model.text_encoder, list):
+                # sdxl model
+                for text_encoder in sd_model.text_encoder:
+                    for key, value in text_encoder.state_dict().items():
+                        value *= model_config.weight
+            else:
+                # normal model
+                for key, value in sd_model.text_encoder.state_dict().items():
+                    value *= model_config.weight
+            # adjust the weights of the unet
+            for key, value in sd_model.unet.state_dict().items():
+                value *= model_config.weight
+            if output_model is None:
+                # use this one as the base
+                output_model = sd_model
+            else:
+                # merge the models
+                # text encoder
+                if isinstance(output_model.text_encoder, list):
+                    # sdxl model
+                    for i, text_encoder in enumerate(output_model.text_encoder):
+                        for key, value in text_encoder.state_dict().items():
+                            value += sd_model.text_encoder[i].state_dict()[key]
+                else:
+                    # normal model
+                    for key, value in output_model.text_encoder.state_dict().items():
+                        value += sd_model.text_encoder.state_dict()[key]
+                # unet
+                for key, value in output_model.unet.state_dict().items():
+                    value += sd_model.unet.state_dict()[key]
+                # remove the model to free memory
+                del sd_model
+                flush()
+        # merge loop is done, let's save the model
+        print(f"Saving merged model to {self.save_path}")
+        output_model.save(self.save_path, meta=self.meta, save_dtype=self.save_dtype)
+        print(f"Saved merged model to {self.save_path}")
+        # do cleanup here
+        del output_model
+        flush()

ai-toolkit/extensions/example/__init__.py ADDED Viewed

	@@ -0,0 +1,25 @@

+# This is an example extension for custom training. It is great for experimenting with new ideas.
+from toolkit.extension import Extension
+# We make a subclass of Extension
+class ExampleMergeExtension(Extension):
+    # uid must be unique, it is how the extension is identified
+    uid = "example_merge_extension"
+    # name is the name of the extension for printing
+    name = "Example Merge Extension"
+    # This is where your process class is loaded
+    # keep your imports in here so they don't slow down the rest of the program
+    @classmethod
+    def get_process(cls):
+        # import your process class here so it is only loaded when needed and return it
+        from .ExampleMergeModels import ExampleMergeModels
+        return ExampleMergeModels
+# Module-level list of the Extension subclasses this package provides.
+# NOTE(review): presumably the toolkit's extension loader looks for this exact name -- confirm
+AI_TOOLKIT_EXTENSIONS = [
+    # you can put a list of extensions here
+    ExampleMergeExtension
+]

ai-toolkit/extensions/example/config/config.example.yaml ADDED Viewed

	@@ -0,0 +1,48 @@

+---
+# Always include at least one example config file to show how to use your extension.
+# use plenty of comments so users know how to use it and what everything does
+# all extensions will use this job name
+job: extension
+config:
+  name: 'my_awesome_merge'
+  process:
+    # Put your example processes here. This will be passed
+    # to your extension process in the config argument.
+    # the type MUST match your extension uid
+    - type: "example_merge_extension"
+      # save path for the merged model
+      save_path: "output/merge/[name].safetensors"
+      # dtype the merged model is saved in (the example process reads the `save_dtype` key)
+      save_dtype: float16
+      # device to run it on
+      device: cuda:0
+      # input models can only be SD1.x and SD2.x models for this example (currently)
+      models_to_merge:
+        # weights are relative, total weights will be normalized
+        # for example. If you have 2 models with weight 1.0, they will
+        # both be weighted 0.5. If you have 1 model with weight 1.0 and
+        # another with weight 2.0, the first will be weighted 1/3 and the
+        # second will be weighted 2/3
+        - name_or_path: "input/model1.safetensors"
+          weight: 1.0
+        - name_or_path: "input/model2.safetensors"
+          weight: 1.0
+        - name_or_path: "input/model3.safetensors"
+          weight: 0.3
+        - name_or_path: "input/model4.safetensors"
+          weight: 1.0
+# you can put any information you want here, and it will be saved in the model
+# the below is an example. I recommend doing trigger words at a minimum
+# in the metadata. The software will include this plus some other information
+meta:
+  name: "[name]"  # [name] gets replaced with the name above
+  description: A short description of your model
+  version: '0.1'
+  creator:
+    name: Your Name
+    email: your@email.com
+    website: https://yourwebsite.com
+  any: All meta data above is arbitrary, it can be whatever you want.

ai-toolkit/extensions_built_in/advanced_generator/Img2ImgGenerator.py ADDED Viewed

	@@ -0,0 +1,256 @@

+import math
+import os
+import random
+from collections import OrderedDict
+from typing import List
+import numpy as np
+from PIL import Image
+from diffusers import T2IAdapter
+from diffusers.utils.torch_utils import randn_tensor
+from torch.utils.data import DataLoader
+from diffusers import StableDiffusionXLImg2ImgPipeline, PixArtSigmaPipeline
+from tqdm import tqdm
+from toolkit.config_modules import ModelConfig, GenerateImageConfig, preprocess_dataset_raw_config, DatasetConfig
+from toolkit.data_transfer_object.data_loader import FileItemDTO, DataLoaderBatchDTO
+from toolkit.sampler import get_sampler
+from toolkit.stable_diffusion_model import StableDiffusion
+import gc
+import torch
+from jobs.process import BaseExtensionProcess
+from toolkit.data_loader import get_dataloader_from_datasets
+from toolkit.train_tools import get_torch_dtype
+from controlnet_aux.midas import MidasDetector
+from diffusers.utils import load_image
+from torchvision.transforms import ToTensor
+def flush():
+    torch.cuda.empty_cache()
+    gc.collect()
+class GenerateConfig:
+    def __init__(self, **kwargs):
+        self.prompts: List[str]
+        self.sampler = kwargs.get('sampler', 'ddpm')
+        self.neg = kwargs.get('neg', '')
+        self.seed = kwargs.get('seed', -1)
+        self.walk_seed = kwargs.get('walk_seed', False)
+        self.guidance_scale = kwargs.get('guidance_scale', 7)
+        self.sample_steps = kwargs.get('sample_steps', 20)
+        self.guidance_rescale = kwargs.get('guidance_rescale', 0.0)
+        self.ext = kwargs.get('ext', 'png')
+        self.denoise_strength = kwargs.get('denoise_strength', 0.5)
+        self.trigger_word = kwargs.get('trigger_word', None)
+class Img2ImgGenerator(BaseExtensionProcess):
+    def __init__(self, process_id: int, job, config: OrderedDict):
+        super().__init__(process_id, job, config)
+        self.output_folder = self.get_conf('output_folder', required=True)
+        self.copy_inputs_to = self.get_conf('copy_inputs_to', None)
+        self.device = self.get_conf('device', 'cuda')
+        self.model_config = ModelConfig(**self.get_conf('model', required=True))
+        self.generate_config = GenerateConfig(**self.get_conf('generate', required=True))
+        self.is_latents_cached = True
+        raw_datasets = self.get_conf('datasets', None)
+        if raw_datasets is not None and len(raw_datasets) > 0:
+            raw_datasets = preprocess_dataset_raw_config(raw_datasets)
+        self.datasets = None
+        self.datasets_reg = None
+        self.dtype = self.get_conf('dtype', 'float16')
+        self.torch_dtype = get_torch_dtype(self.dtype)
+        self.params = []
+        if raw_datasets is not None and len(raw_datasets) > 0:
+            for raw_dataset in raw_datasets:
+                dataset = DatasetConfig(**raw_dataset)
+                is_caching = dataset.cache_latents or dataset.cache_latents_to_disk
+                if not is_caching:
+                    self.is_latents_cached = False
+                if dataset.is_reg:
+                    if self.datasets_reg is None:
+                        self.datasets_reg = []
+                    self.datasets_reg.append(dataset)
+                else:
+                    if self.datasets is None:
+                        self.datasets = []
+                    self.datasets.append(dataset)
+        self.progress_bar = None
+        self.sd = StableDiffusion(
+            device=self.device,
+            model_config=self.model_config,
+            dtype=self.dtype,
+        )
+        print(f"Using device {self.device}")
+        self.data_loader: DataLoader = None
+        self.adapter: T2IAdapter = None
+    def to_pil(self, img):
+        # image comes in -1 to 1. convert to a PIL RGB image
+        img = (img + 1) / 2
+        img = img.clamp(0, 1)
+        img = img[0].permute(1, 2, 0).cpu().numpy()
+        img = (img * 255).astype(np.uint8)
+        image = Image.fromarray(img)
+        return image
+    def run(self):
+        with torch.no_grad():
+            super().run()
+            print("Loading model...")
+            self.sd.load_model()
+            device = torch.device(self.device)
+            if self.model_config.is_xl:
+                pipe = StableDiffusionXLImg2ImgPipeline(
+                    vae=self.sd.vae,
+                    unet=self.sd.unet,
+                    text_encoder=self.sd.text_encoder[0],
+                    text_encoder_2=self.sd.text_encoder[1],
+                    tokenizer=self.sd.tokenizer[0],
+                    tokenizer_2=self.sd.tokenizer[1],
+                    scheduler=get_sampler(self.generate_config.sampler),
+                ).to(device, dtype=self.torch_dtype)
+            elif self.model_config.is_pixart:
+                pipe = self.sd.pipeline.to(device, dtype=self.torch_dtype)
+            else:
+                raise NotImplementedError("Only XL models are supported")
+            pipe.set_progress_bar_config(disable=True)
+            # pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+            # midas_depth = torch.compile(midas_depth, mode="reduce-overhead", fullgraph=True)
+            self.data_loader = get_dataloader_from_datasets(self.datasets, 1, self.sd)
+            num_batches = len(self.data_loader)
+            pbar = tqdm(total=num_batches, desc="Generating images")
+            seed = self.generate_config.seed
+            # load images from datasets, use tqdm
+            for i, batch in enumerate(self.data_loader):
+                batch: DataLoaderBatchDTO = batch
+                gen_seed = seed if seed > 0 else random.randint(0, 2 ** 32 - 1)
+                generator = torch.manual_seed(gen_seed)
+                file_item: FileItemDTO = batch.file_items[0]
+                img_path = file_item.path
+                img_filename = os.path.basename(img_path)
+                img_filename_no_ext = os.path.splitext(img_filename)[0]
+                img_filename = img_filename_no_ext + '.' + self.generate_config.ext
+                output_path = os.path.join(self.output_folder, img_filename)
+                output_caption_path = os.path.join(self.output_folder, img_filename_no_ext + '.txt')
+                if self.copy_inputs_to is not None:
+                    output_inputs_path = os.path.join(self.copy_inputs_to, img_filename)
+                    output_inputs_caption_path = os.path.join(self.copy_inputs_to, img_filename_no_ext + '.txt')
+                else:
+                    output_inputs_path = None
+                    output_inputs_caption_path = None
+                caption = batch.get_caption_list()[0]
+                if self.generate_config.trigger_word is not None:
+                    caption = caption.replace('[trigger]', self.generate_config.trigger_word)
+                img: torch.Tensor = batch.tensor.clone()
+                image = self.to_pil(img)
+                # image.save(output_depth_path)
+                if self.model_config.is_pixart:
+                    pipe: PixArtSigmaPipeline = pipe
+                    # Encode the full image once
+                    encoded_image = pipe.vae.encode(
+                        pipe.image_processor.preprocess(image).to(device=pipe.device, dtype=pipe.dtype))
+                    if hasattr(encoded_image, "latent_dist"):
+                        latents = encoded_image.latent_dist.sample(generator)
+                    elif hasattr(encoded_image, "latents"):
+                        latents = encoded_image.latents
+                    else:
+                        raise AttributeError("Could not access latents of provided encoder_output")
+                    latents = pipe.vae.config.scaling_factor * latents
+                    # latents = self.sd.encode_images(img)
+                    # self.sd.noise_scheduler.set_timesteps(self.generate_config.sample_steps)
+                    # start_step = math.floor(self.generate_config.sample_steps * self.generate_config.denoise_strength)
+                    # timestep = self.sd.noise_scheduler.timesteps[start_step].unsqueeze(0)
+                    # timestep = timestep.to(device, dtype=torch.int32)
+                    # latent = latent.to(device, dtype=self.torch_dtype)
+                    # noise = torch.randn_like(latent, device=device, dtype=self.torch_dtype)
+                    # latent = self.sd.add_noise(latent, noise, timestep)
+                    # timesteps_to_use = self.sd.noise_scheduler.timesteps[start_step + 1:]
+                    batch_size = 1
+                    num_images_per_prompt = 1
+                    shape = (batch_size, pipe.transformer.config.in_channels, image.height // pipe.vae_scale_factor,
+                             image.width // pipe.vae_scale_factor)
+                    noise = randn_tensor(shape, generator=generator, device=pipe.device, dtype=pipe.dtype)
+                    # noise = torch.randn_like(latents, device=device, dtype=self.torch_dtype)
+                    num_inference_steps = self.generate_config.sample_steps
+                    strength = self.generate_config.denoise_strength
+                    # Get timesteps
+                    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+                    t_start = max(num_inference_steps - init_timestep, 0)
+                    pipe.scheduler.set_timesteps(num_inference_steps, device="cpu")
+                    timesteps = pipe.scheduler.timesteps[t_start:]
+                    timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
+                    latents = pipe.scheduler.add_noise(latents, noise, timestep)
+                    gen_images = pipe.__call__(
+                        prompt=caption,
+                        negative_prompt=self.generate_config.neg,
+                        latents=latents,
+                        timesteps=timesteps,
+                        width=image.width,
+                        height=image.height,
+                        num_inference_steps=num_inference_steps,
+                        num_images_per_prompt=num_images_per_prompt,
+                        guidance_scale=self.generate_config.guidance_scale,
+                        # strength=self.generate_config.denoise_strength,
+                        use_resolution_binning=False,
+                        output_type="np"
+                    ).images[0]
+                    gen_images = (gen_images * 255).clip(0, 255).astype(np.uint8)
+                    gen_images = Image.fromarray(gen_images)
+                else:
+                    pipe: StableDiffusionXLImg2ImgPipeline = pipe
+                    gen_images = pipe.__call__(
+                        prompt=caption,
+                        negative_prompt=self.generate_config.neg,
+                        image=image,
+                        num_inference_steps=self.generate_config.sample_steps,
+                        guidance_scale=self.generate_config.guidance_scale,
+                        strength=self.generate_config.denoise_strength,
+                    ).images[0]
+                os.makedirs(os.path.dirname(output_path), exist_ok=True)
+                gen_images.save(output_path)
+                # save caption
+                with open(output_caption_path, 'w') as f:
+                    f.write(caption)
+                if output_inputs_path is not None:
+                    os.makedirs(os.path.dirname(output_inputs_path), exist_ok=True)
+                    image.save(output_inputs_path)
+                    with open(output_inputs_caption_path, 'w') as f:
+                        f.write(caption)
+                pbar.update(1)
+                batch.cleanup()
+            pbar.close()
+            print("Done generating images")
+            # cleanup
+            del self.sd
+            gc.collect()
+            torch.cuda.empty_cache()

ai-toolkit/extensions_built_in/advanced_generator/PureLoraGenerator.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import os
+from collections import OrderedDict
+from toolkit.config_modules import ModelConfig, GenerateImageConfig, SampleConfig, LoRMConfig
+from toolkit.lorm import ExtractMode, convert_diffusers_unet_to_lorm
+from toolkit.sd_device_states_presets import get_train_sd_device_state_preset
+from toolkit.stable_diffusion_model import StableDiffusion
+import gc
+import torch
+from jobs.process import BaseExtensionProcess
+from toolkit.train_tools import get_torch_dtype
+def flush():
+    torch.cuda.empty_cache()
+    gc.collect()
+class PureLoraGenerator(BaseExtensionProcess):
+    def __init__(self, process_id: int, job, config: OrderedDict):
+        super().__init__(process_id, job, config)
+        self.output_folder = self.get_conf('output_folder', required=True)
+        self.device = self.get_conf('device', 'cuda')
+        self.device_torch = torch.device(self.device)
+        self.model_config = ModelConfig(**self.get_conf('model', required=True))
+        self.generate_config = SampleConfig(**self.get_conf('sample', required=True))
+        self.dtype = self.get_conf('dtype', 'float16')
+        self.torch_dtype = get_torch_dtype(self.dtype)
+        lorm_config = self.get_conf('lorm', None)
+        self.lorm_config = LoRMConfig(**lorm_config) if lorm_config is not None else None
+        self.device_state_preset = get_train_sd_device_state_preset(
+            device=torch.device(self.device),
+        )
+        self.progress_bar = None
+        self.sd = StableDiffusion(
+            device=self.device,
+            model_config=self.model_config,
+            dtype=self.dtype,
+        )
+    def run(self):
+        super().run()
+        print("Loading model...")
+        with torch.no_grad():
+            self.sd.load_model()
+            self.sd.unet.eval()
+            self.sd.unet.to(self.device_torch)
+            if isinstance(self.sd.text_encoder, list):
+                for te in self.sd.text_encoder:
+                    te.eval()
+                    te.to(self.device_torch)
+            else:
+                self.sd.text_encoder.eval()
+                self.sd.to(self.device_torch)
+            print(f"Converting to LoRM UNet")
+            # replace the unet with LoRMUnet
+            convert_diffusers_unet_to_lorm(
+                self.sd.unet,
+                config=self.lorm_config,
+            )
+            sample_folder = os.path.join(self.output_folder)
+            gen_img_config_list = []
+            sample_config = self.generate_config
+            start_seed = sample_config.seed
+            current_seed = start_seed
+            for i in range(len(sample_config.prompts)):
+                if sample_config.walk_seed:
+                    current_seed = start_seed + i
+                filename = f"[time]_[count].{self.generate_config.ext}"
+                output_path = os.path.join(sample_folder, filename)
+                prompt = sample_config.prompts[i]
+                extra_args = {}
+                gen_img_config_list.append(GenerateImageConfig(
+                    prompt=prompt,  # it will autoparse the prompt
+                    width=sample_config.width,
+                    height=sample_config.height,
+                    negative_prompt=sample_config.neg,
+                    seed=current_seed,
+                    guidance_scale=sample_config.guidance_scale,
+                    guidance_rescale=sample_config.guidance_rescale,
+                    num_inference_steps=sample_config.sample_steps,
+                    network_multiplier=sample_config.network_multiplier,
+                    output_path=output_path,
+                    output_ext=sample_config.ext,
+                    adapter_conditioning_scale=sample_config.adapter_conditioning_scale,
+                    **extra_args
+                ))
+            # send to be generated
+            self.sd.generate_images(gen_img_config_list, sampler=sample_config.sampler)
+            print("Done generating images")
+            # cleanup
+            del self.sd
+            gc.collect()
+            torch.cuda.empty_cache()

ai-toolkit/extensions_built_in/advanced_generator/ReferenceGenerator.py ADDED Viewed

	@@ -0,0 +1,212 @@

+import os
+import random
+from collections import OrderedDict
+from typing import List
+import numpy as np
+from PIL import Image
+from diffusers import T2IAdapter
+from torch.utils.data import DataLoader
+from diffusers import StableDiffusionXLAdapterPipeline, StableDiffusionAdapterPipeline
+from tqdm import tqdm
+from toolkit.config_modules import ModelConfig, GenerateImageConfig, preprocess_dataset_raw_config, DatasetConfig
+from toolkit.data_transfer_object.data_loader import FileItemDTO, DataLoaderBatchDTO
+from toolkit.sampler import get_sampler
+from toolkit.stable_diffusion_model import StableDiffusion
+import gc
+import torch
+from jobs.process import BaseExtensionProcess
+from toolkit.data_loader import get_dataloader_from_datasets
+from toolkit.train_tools import get_torch_dtype
+from controlnet_aux.midas import MidasDetector
+from diffusers.utils import load_image
+def flush():
+    torch.cuda.empty_cache()
+    gc.collect()
+class GenerateConfig:
+    def __init__(self, **kwargs):
+        self.prompts: List[str]
+        self.sampler = kwargs.get('sampler', 'ddpm')
+        self.neg = kwargs.get('neg', '')
+        self.seed = kwargs.get('seed', -1)
+        self.walk_seed = kwargs.get('walk_seed', False)
+        self.t2i_adapter_path = kwargs.get('t2i_adapter_path', None)
+        self.guidance_scale = kwargs.get('guidance_scale', 7)
+        self.sample_steps = kwargs.get('sample_steps', 20)
+        self.prompt_2 = kwargs.get('prompt_2', None)
+        self.neg_2 = kwargs.get('neg_2', None)
+        self.prompts = kwargs.get('prompts', None)
+        self.guidance_rescale = kwargs.get('guidance_rescale', 0.0)
+        self.ext = kwargs.get('ext', 'png')
+        self.adapter_conditioning_scale = kwargs.get('adapter_conditioning_scale', 1.0)
+        if kwargs.get('shuffle', False):
+            # shuffle the prompts
+            random.shuffle(self.prompts)
+class ReferenceGenerator(BaseExtensionProcess):
+    def __init__(self, process_id: int, job, config: OrderedDict):
+        super().__init__(process_id, job, config)
+        self.output_folder = self.get_conf('output_folder', required=True)
+        self.device = self.get_conf('device', 'cuda')
+        self.model_config = ModelConfig(**self.get_conf('model', required=True))
+        self.generate_config = GenerateConfig(**self.get_conf('generate', required=True))
+        self.is_latents_cached = True
+        raw_datasets = self.get_conf('datasets', None)
+        if raw_datasets is not None and len(raw_datasets) > 0:
+            raw_datasets = preprocess_dataset_raw_config(raw_datasets)
+        self.datasets = None
+        self.datasets_reg = None
+        self.dtype = self.get_conf('dtype', 'float16')
+        self.torch_dtype = get_torch_dtype(self.dtype)
+        self.params = []
+        if raw_datasets is not None and len(raw_datasets) > 0:
+            for raw_dataset in raw_datasets:
+                dataset = DatasetConfig(**raw_dataset)
+                is_caching = dataset.cache_latents or dataset.cache_latents_to_disk
+                if not is_caching:
+                    self.is_latents_cached = False
+                if dataset.is_reg:
+                    if self.datasets_reg is None:
+                        self.datasets_reg = []
+                    self.datasets_reg.append(dataset)
+                else:
+                    if self.datasets is None:
+                        self.datasets = []
+                    self.datasets.append(dataset)
+        self.progress_bar = None
+        self.sd = StableDiffusion(
+            device=self.device,
+            model_config=self.model_config,
+            dtype=self.dtype,
+        )
+        print(f"Using device {self.device}")
+        self.data_loader: DataLoader = None
+        self.adapter: T2IAdapter = None
+    def run(self):
+        super().run()
+        print("Loading model...")
+        self.sd.load_model()
+        device = torch.device(self.device)
+        if self.generate_config.t2i_adapter_path is not None:
+            self.adapter = T2IAdapter.from_pretrained(
+                self.generate_config.t2i_adapter_path,
+                torch_dtype=self.torch_dtype,
+                varient="fp16"
+            ).to(device)
+        midas_depth = MidasDetector.from_pretrained(
+            "valhalla/t2iadapter-aux-models", filename="dpt_large_384.pt", model_type="dpt_large"
+        ).to(device)
+        if self.model_config.is_xl:
+            pipe = StableDiffusionXLAdapterPipeline(
+                vae=self.sd.vae,
+                unet=self.sd.unet,
+                text_encoder=self.sd.text_encoder[0],
+                text_encoder_2=self.sd.text_encoder[1],
+                tokenizer=self.sd.tokenizer[0],
+                tokenizer_2=self.sd.tokenizer[1],
+                scheduler=get_sampler(self.generate_config.sampler),
+                adapter=self.adapter,
+            ).to(device, dtype=self.torch_dtype)
+        else:
+            pipe = StableDiffusionAdapterPipeline(
+                vae=self.sd.vae,
+                unet=self.sd.unet,
+                text_encoder=self.sd.text_encoder,
+                tokenizer=self.sd.tokenizer,
+                scheduler=get_sampler(self.generate_config.sampler),
+                safety_checker=None,
+                feature_extractor=None,
+                requires_safety_checker=False,
+                adapter=self.adapter,
+            ).to(device, dtype=self.torch_dtype)
+        pipe.set_progress_bar_config(disable=True)
+        pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
+        # midas_depth = torch.compile(midas_depth, mode="reduce-overhead", fullgraph=True)
+        self.data_loader = get_dataloader_from_datasets(self.datasets, 1, self.sd)
+        num_batches = len(self.data_loader)
+        pbar = tqdm(total=num_batches, desc="Generating images")
+        seed = self.generate_config.seed
+        # load images from datasets, use tqdm
+        for i, batch in enumerate(self.data_loader):
+            batch: DataLoaderBatchDTO = batch
+            file_item: FileItemDTO = batch.file_items[0]
+            img_path = file_item.path
+            img_filename = os.path.basename(img_path)
+            img_filename_no_ext = os.path.splitext(img_filename)[0]
+            output_path = os.path.join(self.output_folder, img_filename)
+            output_caption_path = os.path.join(self.output_folder, img_filename_no_ext + '.txt')
+            output_depth_path = os.path.join(self.output_folder, img_filename_no_ext + '.depth.png')
+            caption = batch.get_caption_list()[0]
+            img: torch.Tensor = batch.tensor.clone()
+            # image comes in -1 to 1. convert to a PIL RGB image
+            img = (img + 1) / 2
+            img = img.clamp(0, 1)
+            img = img[0].permute(1, 2, 0).cpu().numpy()
+            img = (img * 255).astype(np.uint8)
+            image = Image.fromarray(img)
+            width, height = image.size
+            min_res = min(width, height)
+            if self.generate_config.walk_seed:
+                seed = seed + 1
+            if self.generate_config.seed == -1:
+                # random
+                seed = random.randint(0, 1000000)
+            torch.manual_seed(seed)
+            torch.cuda.manual_seed(seed)
+            # generate depth map
+            image = midas_depth(
+                image,
+                detect_resolution=min_res,  # do 512 ?
+                image_resolution=min_res
+            )
+            # image.save(output_depth_path)
+            gen_images = pipe(
+                prompt=caption,
+                negative_prompt=self.generate_config.neg,
+                image=image,
+                num_inference_steps=self.generate_config.sample_steps,
+                adapter_conditioning_scale=self.generate_config.adapter_conditioning_scale,
+                guidance_scale=self.generate_config.guidance_scale,
+            ).images[0]
+            os.makedirs(os.path.dirname(output_path), exist_ok=True)
+            gen_images.save(output_path)
+            # save caption
+            with open(output_caption_path, 'w') as f:
+                f.write(caption)
+            pbar.update(1)
+            batch.cleanup()
+        pbar.close()
+        print("Done generating images")
+        # cleanup
+        del self.sd
+        gc.collect()
+        torch.cuda.empty_cache()

ai-toolkit/extensions_built_in/advanced_generator/__init__.py ADDED Viewed

	@@ -0,0 +1,59 @@

+# This is an example extension for custom training. It is great for experimenting with new ideas.
+from toolkit.extension import Extension
+# This is for generic training (LoRA, Dreambooth, FineTuning)
+class AdvancedReferenceGeneratorExtension(Extension):
+    # uid must be unique, it is how the extension is identified
+    uid = "reference_generator"
+    # name is the name of the extension for printing
+    name = "Reference Generator"
+    # This is where your process class is loaded
+    # keep your imports in here so they don't slow down the rest of the program
+    @classmethod
+    def get_process(cls):
+        # import your process class here so it is only loaded when needed and return it
+        from .ReferenceGenerator import ReferenceGenerator
+        return ReferenceGenerator
+# This is for generic training (LoRA, Dreambooth, FineTuning)
+class PureLoraGenerator(Extension):
+    # uid must be unique, it is how the extension is identified
+    uid = "pure_lora_generator"
+    # name is the name of the extension for printing
+    name = "Pure LoRA Generator"
+    # This is where your process class is loaded
+    # keep your imports in here so they don't slow down the rest of the program
+    @classmethod
+    def get_process(cls):
+        # import your process class here so it is only loaded when needed and return it
+        from .PureLoraGenerator import PureLoraGenerator
+        return PureLoraGenerator
+# This is for generic training (LoRA, Dreambooth, FineTuning)
+class Img2ImgGeneratorExtension(Extension):
+    # uid must be unique, it is how the extension is identified
+    uid = "batch_img2img"
+    # name is the name of the extension for printing
+    name = "Img2ImgGeneratorExtension"
+    # This is where your process class is loaded
+    # keep your imports in here so they don't slow down the rest of the program
+    @classmethod
+    def get_process(cls):
+        # import your process class here so it is only loaded when needed and return it
+        from .Img2ImgGenerator import Img2ImgGenerator
+        return Img2ImgGenerator
+AI_TOOLKIT_EXTENSIONS = [
+    # you can put a list of extensions here
+    AdvancedReferenceGeneratorExtension, PureLoraGenerator, Img2ImgGeneratorExtension
+]

ai-toolkit/extensions_built_in/advanced_generator/config/train.example.yaml ADDED Viewed

	@@ -0,0 +1,91 @@

+---
+job: extension
+config:
+  name: test_v1
+  process:
+    - type: 'textual_inversion_trainer'
+      training_folder: "out/TI"
+      device: cuda:0
+      # for tensorboard logging
+      log_dir: "out/.tensorboard"
+      embedding:
+        trigger: "your_trigger_here"
+        tokens: 12
+        init_words: "man with short brown hair"
+        save_format: "safetensors"  # 'safetensors' or 'pt'
+      save:
+        dtype: float16 # precision to save
+        save_every: 100 # save every this many steps
+        max_step_saves_to_keep: 5 # only affects step counts
+      datasets:
+        - folder_path: "/path/to/dataset"
+          caption_ext: "txt"
+          default_caption: "[trigger]"
+          buckets: true
+          resolution: 512
+      train:
+        noise_scheduler: "ddpm" # one of "ddpm", "lms", "euler_a"
+        steps: 3000
+        weight_jitter: 0.0
+        lr: 5e-5
+        train_unet: false
+        gradient_checkpointing: true
+        train_text_encoder: false
+        optimizer: "adamw"
+#        optimizer: "prodigy"
+        optimizer_params:
+          weight_decay: 1e-2
+        lr_scheduler: "constant"
+        max_denoising_steps: 1000
+        batch_size: 4
+        dtype: bf16
+        xformers: true
+        min_snr_gamma: 5.0
+#        skip_first_sample: true
+        noise_offset: 0.0 # not needed for this
+      model:
+        # objective reality v2
+        name_or_path: "https://civitai.com/models/128453?modelVersionId=142465"
+        is_v2: false  # for v2 models
+        is_xl: false  # for SDXL models
+        is_v_pred: false # for v-prediction models (most v2 models)
+      sample:
+        sampler: "ddpm" # must match train.noise_scheduler
+        sample_every: 100 # sample every this many steps
+        width: 512
+        height: 512
+        prompts:
+          - "photo of [trigger] laughing"
+          - "photo of [trigger] smiling"
+          - "[trigger] close up"
+          - "dark scene [trigger] frozen"
+          - "[trigger] nighttime"
+          - "a painting of [trigger]"
+          - "a drawing of [trigger]"
+          - "a cartoon of [trigger]"
+          - "[trigger] pixar style"
+          - "[trigger] costume"
+        neg: ""
+        seed: 42
+        walk_seed: false
+        guidance_scale: 7
+        sample_steps: 20
+        network_multiplier: 1.0
+      logging:
+        log_every: 10 # log every this many steps
+        use_wandb: false # not supported yet
+        verbose: false
+# You can put any information you want here, and it will be saved in the model.
+# The below is an example, but you can put your grocery list in it if you want.
+# It is saved in the model so be aware of that. The software will include this
+# plus some other information for you automatically
+meta:
+  # [name] gets replaced with the name above
+  name: "[name]"
+#  version: '1.0'
+#  creator:
+#    name: Your Name
+#    email: your@gmail.com
+#    website: https://your.website

ai-toolkit/extensions_built_in/audio_models/__init__.py ADDED Viewed

	@@ -0,0 +1,7 @@

+from .ace_step import AceStep15Model, AceStep15XLModel
+AI_TOOLKIT_MODELS = [
+    # put a list of models here
+    AceStep15Model,
+    AceStep15XLModel,
+]

ai-toolkit/extensions_built_in/audio_models/ace_step/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+ from .ace_step_15_model import AceStep15Model, AceStep15XLModel

ai-toolkit/extensions_built_in/audio_models/ace_step/ace_step_15_model.py ADDED Viewed

	@@ -0,0 +1,335 @@

+import json
+import os
+from typing import List, Optional
+import huggingface_hub
+import torch
+from safetensors.torch import load_file, save_file
+from extensions_built_in.audio_models.base_audio_model import BaseAudioModel
+from toolkit.basic import flush
+from toolkit.config_modules import GenerateImageConfig
+from toolkit.prompt_utils import PromptEmbeds, concat_prompt_embeds
+from toolkit.samplers.custom_flowmatch_sampler import (
+    CustomFlowMatchEulerDiscreteScheduler,
+)
+from toolkit.util.quantize import get_qtype, quantize, quantize_model
+from optimum.quanto import freeze
+from .src.model import (
+    AceStep15,
+    OobleckVAE,
+    TextEncoder,
+    get_silence_latent,
+    load_models,
+)
+from transformers import AutoTokenizer
+from .src.pipeline import AceStep15Pipeline
+scheduler_config = {
+    "num_train_timesteps": 1000,
+    "shift": 3.0,
+    "use_dynamic_shifting": False,
+}
+def to_number(str_or_number, default):
+    if isinstance(str_or_number, (int, float)):
+        return str_or_number
+    if str_or_number is None:
+        return default
+    if str_or_number == "":
+        return default
+    try:
+        return float(str_or_number)
+    except ValueError:
+        try:
+            return int(str_or_number)
+        except ValueError as e:
+            raise ValueError(f"Could not convert {str_or_number} to a number") from e
+def parse_ace_step_caption(text):
+    """Parse a tagged caption file back into a dict."""
+    import re
+    def tag(name):
+        m = re.search(rf"<{name}>(.*?)</{name}>", text, re.DOTALL)
+        return m.group(1).strip() if m else ""
+    return {
+        "caption": tag("CAPTION"),
+        "lyrics": tag("LYRICS"),
+        "bpm": to_number(tag("BPM"), 120),
+        "keyscale": tag("KEYSCALE"),
+        "timesignature": tag("TIMESIGNATURE"),
+        "duration": to_number(tag("DURATION"), 1.0),
+        "language": tag("LANGUAGE"),
+    }
+class AceStep15Model(BaseAudioModel):
+    arch = "ace_step_15"
+    sample_rate = 48000
+    def __init__(
+        self,
+        device,
+        model_config,
+        dtype="bf16",
+        custom_pipeline=None,
+        noise_scheduler=None,
+        **kwargs,
+    ):
+        super().__init__(
+            device, model_config, dtype, custom_pipeline, noise_scheduler, **kwargs
+        )
+        self.is_flow_matching = True
+        self.is_transformer = True
+        # self.target_lora_modules = ['AceStep15']
+        self.target_lora_modules = ["DiTModel"]
+    # static method to get the noise scheduler
+    @staticmethod
+    def get_train_scheduler():
+        return CustomFlowMatchEulerDiscreteScheduler(**scheduler_config)
+    def load_model(self):
+        dtype = self.torch_dtype
+        device = self.device_torch
+        model_path = self.model_config.name_or_path
+        if not os.path.exists(model_path):
+            # assume it is a hf repo like org/repo/filename.safetensors
+            path_parts = model_path.split("/")
+            if len(path_parts) != 3:
+                raise ValueError(
+                    f"Model path {model_path} does not exist and is not a valid Hugging Face repo path"
+                )
+            model_path = huggingface_hub.hf_hub_download(
+                repo_id=f"{path_parts[0]}/{path_parts[1]}",
+                filename=path_parts[2],
+            )
+        # load the models from the single safetensors file
+        load_device = device
+        if self.model_config.low_vram:
+            load_device = "cpu"
+        models = load_models(model_path, device=load_device, dtype=dtype)
+        self.model = models["model"]
+        if self.model_config.quantize:
+            self.print_and_status_update("Quantizing Transformer")
+            # quantize_model(self, self.model.decoder)
+            quantize(self.model, weights=get_qtype(self.model_config.qtype))
+            freeze(self.model)
+            flush()
+        if self.model_config.low_vram:
+            self.print_and_status_update("Moving transformer to CPU")
+            self.model.to("cpu")
+        if (
+            self.model_config.layer_offloading
+            and self.model_config.layer_offloading_transformer_percent > 0
+        ):
+            raise NotImplementedError("Layer offloading not yet implemented for AceStep15Model")
+        self.text_encoder = models["text_encoder"]
+        if self.model_config.quantize_te:
+            self.print_and_status_update("Quantizing Text Encoder")
+            quantize(self.text_encoder, weights=get_qtype(self.model_config.qtype_te))
+            freeze(self.text_encoder)
+            flush()
+        self.vae = models["vae"]
+        # move back to device
+        self.model.to(device)
+        self.text_encoder.to(device)
+        self.vae.to(device)
+        self.tokenizer = models["tokenizer"]
+        self.pipeline = AceStep15Pipeline(
+            transformer=self.model,
+            vae=self.vae,
+            text_encoder=self.text_encoder,
+            tokenizer=self.tokenizer,
+            scheduler=self.get_train_scheduler(),
+        )
+        if self.model_config.low_vram:
+            self.pipeline.do_tiled_decoding = True
+    def get_prompt_embeds(self, prompt: str) -> PromptEmbeds:
+        if isinstance(prompt, str):
+            prompts = [prompt]
+        else:
+            prompts = prompt
+        if self.text_encoder.device == torch.device("cpu"):
+            self.text_encoder.to(self.device_torch)
+        # we need the encoder from the model
+        if self.model.encoder.device == torch.device("cpu"):
+            self.model.encoder.to(self.device_torch)
+        # the prompt should be json as a string. Try to parse it.
+        json_prompts = []
+        for p in prompts:
+            try:
+                json_prompts.append(parse_ace_step_caption(p))
+            except json.JSONDecodeError:
+                raise ValueError(
+                    f"Prompt {p} is not a valid JSON string. Prompts must be JSON for this model"
+                )
+        if self.pipeline.text_encoder.device == torch.device("cpu"):
+            self.pipeline.text_encoder.to(self.device_torch)
+        device = self.text_encoder.device
+        dtype = self.text_encoder.dtype
+        batch_pe = None
+        # TODO not sure this will allow for proper batching
+        for json_prompt in json_prompts:
+            prompt = json_prompt.get("caption", "")
+            lyrics = json_prompt.get("lyrics", "")
+            bpm = json_prompt.get("bpm", 120)
+            key = json_prompt.get("key", "C")
+            time_sig = json_prompt.get("time_sig", "4/4")
+            duration = json_prompt.get("duration", 10)
+            duration = int(duration) if isinstance(duration, (int, float)) else 10
+            language = json_prompt.get("language", "en")
+            text_embeddings, text_mask, lyric_embeddings, lyric_mask = (
+                self.pipeline.get_text_embedings(
+                    prompt, lyrics, bpm, key, time_sig, duration, language
+                )
+            )
+            latent_len = int(duration * self.pipeline.LATENT_RATE)
+            # Silence as source latent [1, 64, T] -> [1, T, 64] for DiT
+            sil = get_silence_latent(latent_len, device, dtype)  # [1, 64, T]
+            src = sil.transpose(1, 2)  # [1, T, 64]
+            chunk_masks = torch.ones_like(src)
+            # Reference audio (silence)
+            ref = sil[:, :, :750].transpose(1, 2)  # [1, 750, 64]
+            ref_order = torch.zeros(1, device=device, dtype=torch.long)
+            enc_h, enc_m, _ = self.pipeline.transformer.prepare_condition(
+                text_embeddings,
+                text_mask,
+                lyric_embeddings,
+                lyric_mask,
+                ref,
+                ref_order,
+                src,
+                chunk_masks,
+            )
+            pe = PromptEmbeds(enc_h, attention_mask=enc_m)
+            if batch_pe is None:
+                batch_pe = pe
+            else:
+                batch_pe = concat_prompt_embeds(batch_pe, pe)
+        return batch_pe
+    def get_transformer_block_names(self) -> Optional[List[str]]:
+        return ["layers"]
+    def get_generation_pipeline(self):
+        return self.pipeline
+    def generate_single_audio(
+        self,
+        pipeline,
+        gen_config: GenerateImageConfig,
+        conditional_embeds: PromptEmbeds,
+        unconditional_embeds: PromptEmbeds,
+        generator: torch.Generator,
+        extra: dict,
+    ):
+        if self.model.device == torch.device("cpu"):
+            self.model.to(self.device_torch)
+        # make sure gen config is setup for audio
+        if gen_config.output_ext not in ['mp3', 'wav']:
+            gen_config.output_ext = 'mp3'
+        prompt = gen_config.prompt
+        json_prompt = parse_ace_step_caption(prompt)
+        prompt = json_prompt.get("caption", "")
+        lyrics = json_prompt.get("lyrics", "")
+        bpm = json_prompt.get("bpm", 120)
+        key = json_prompt.get("key", "C")
+        time_sig = json_prompt.get("time_sig", "4/4")
+        duration = json_prompt.get("duration", 0)
+        language = json_prompt.get("language", "en")
+        output = self.pipeline(
+            prompt=None,  # we are passing in the embeds directly, so no need for a prompt
+            encoder_embeddings=conditional_embeds.text_embeds.to(self.device_torch, dtype=self.torch_dtype),
+            encoder_mask=conditional_embeds.attention_mask.to(self.device_torch, dtype=torch.bool),
+            num_inference_steps=gen_config.num_inference_steps,
+            duration=duration,
+            generator=generator,
+            bpm=bpm,
+            key=key,
+            time_sig=time_sig,
+            language=language,
+            guidance_scale=gen_config.guidance_scale,
+        )
+        return output
+    def get_noise_prediction(
+        self,
+        latent_model_input: torch.Tensor, #(1, 300, 64)
+        timestep: torch.Tensor,  # 0 to 1000 scale
+        text_embeddings: PromptEmbeds,
+        **kwargs,
+    ):
+        if self.model.decoder.device == torch.device("cpu"):
+            self.model.decoder.to(self.device_torch)
+        with torch.no_grad():
+            model: AceStep15 = self.model
+            tt = timestep.to(self.device_torch, dtype=torch.long) / 1000
+            latent_len = latent_model_input.shape[1]
+            device = self.device_torch
+            dtype = self.torch_dtype
+            attn = torch.ones(1, latent_len, device=device, dtype=dtype)
+            # build context from silence latent matching the actual input length
+            sil = get_silence_latent(latent_len, device, dtype)  # [1, 64, T]
+            src = sil.transpose(1, 2)  # [1, T, 64]
+            chunk_masks = torch.ones_like(src)
+            context = torch.cat([src, chunk_masks], dim=-1)  # [1, T, 128]
+        pred = model.decoder(
+            x=latent_model_input.detach(),
+            timestep=tt.detach(),
+            timestep_r=tt.detach(),
+            attention_mask=attn.detach(),
+            enc_h=text_embeddings.text_embeds.to(self.device_torch, dtype=self.torch_dtype).detach(),
+            enc_m=text_embeddings.attention_mask.to(self.device_torch, dtype=torch.bool).detach(),
+            context=context.detach(),
+        )
+        return pred
+    def get_loss_target(self, *args, **kwargs):
+        noise = kwargs.get("noise")
+        batch = kwargs.get("batch")
+        return (noise - batch.latents).detach()
+    def encode_audio(self, audio_tensor: torch.Tensor, device=None, dtype=None):
+        if device is None:
+            device = self.device_torch
+        if dtype is None:
+            dtype = self.torch_dtype
+        if self.vae.device == torch.device("cpu"):
+            self.vae.to(device)
+        output = self.vae.encode(audio_tensor.to(device=device, dtype=dtype))
+        # transpose from [B, 64, T] to [B, T, 64] for DiT
+        output = output.transpose(1, 2).contiguous()
+        return output
+class AceStep15XLModel(AceStep15Model):
+    """Same implementation as ``AceStep15Model``; only the ``arch`` identifier differs."""
+    arch = "ace_step_15_xl"

ai-toolkit/extensions_built_in/audio_models/ace_step/src/__init__.py ADDED Viewed

File without changes