diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 000000000..dd4c1e363 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +.cache/ +cudnn_windows/ +bitsandbytes_windows/ +bitsandbytes_windows_deprecated/ +dataset/ +__pycache__/ +venv/ diff --git a/.gitignore b/.gitignore index b1b3562c2..af16d51fd 100644 --- a/.gitignore +++ b/.gitignore @@ -11,4 +11,6 @@ gui-user.bat gui-user.ps1 .vscode wandb -setup.log \ No newline at end of file +setup.log +logs +SmilingWolf diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..64f66d3f9 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,47 @@ +FROM nvcr.io/nvidia/pytorch:23.04-py3 as base +ENV DEBIAN_FRONTEND=noninteractive +ENV TZ=Europe/London + +RUN apt update && apt-get install -y software-properties-common +RUN add-apt-repository ppa:deadsnakes/ppa && \ + apt update && \ + apt-get install -y git curl libgl1 libglib2.0-0 libgoogle-perftools-dev \ + python3.10-dev python3.10-tk python3-html5lib python3-apt python3-pip python3.10-distutils && \ + rm -rf /var/lib/apt/lists/* + +# Set python 3.10 as default +RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 3 && \ + update-alternatives --config python3 + +RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3 + +WORKDIR /app +RUN python3 -m pip install wheel + +# Todo: Install torch 2.1.0 for cu121 support (only available as nightly as of writing) +## RUN python3 -m pip install --pre torch ninja setuptools --extra-index-url https://download.pytorch.org/whl/nightly/cu121 + +# Todo: Install xformers nightly for Torch 2.1.0 support +## RUN python3 -m pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers + +# Install requirements +COPY requirements.txt setup.py ./ +RUN python3 -m pip install --use-pep517 -r requirements.txt xformers + +# Replace pillow with pillow-simd +RUN python3 -m pip uninstall -y pillow && \ + CC="cc -mavx2" python3 -m pip install -U --force-reinstall pillow-simd + +# Fix missing libnvinfer7 +USER root +RUN ln -s /usr/lib/x86_64-linux-gnu/libnvinfer.so /usr/lib/x86_64-linux-gnu/libnvinfer.so.7 && \ + ln -s /usr/lib/x86_64-linux-gnu/libnvinfer_plugin.so /usr/lib/x86_64-linux-gnu/libnvinfer_plugin.so.7 + +RUN useradd -m -s /bin/bash appuser +USER appuser +COPY --chown=appuser . . 
+ +STOPSIGNAL SIGINT +ENV LD_PRELOAD=libtcmalloc.so +ENV PATH="$PATH:/home/appuser/.local/bin" +CMD python3 "./kohya_gui.py" ${CLI_ARGS} --listen 0.0.0.0 --server_port 7860 diff --git a/README-ja.md b/README-ja.md index 47aaf16e3..6f0e574e2 100644 --- a/README-ja.md +++ b/README-ja.md @@ -16,13 +16,13 @@ GUIやPowerShellスクリプトなど、より使いやすくする機能が[bma 当リポジトリ内およびnote.comに記事がありますのでそちらをご覧ください(将来的にはすべてこちらへ移すかもしれません)。 -* [学習について、共通編](./train_README-ja.md) : データ整備やオプションなど - * [データセット設定](./config_README-ja.md) -* [DreamBoothの学習について](./train_db_README-ja.md) -* [fine-tuningのガイド](./fine_tune_README_ja.md): -* [LoRAの学習について](./train_network_README-ja.md) -* [Textual Inversionの学習について](./train_ti_README-ja.md) -* note.com [画像生成スクリプト](https://note.com/kohya_ss/n/n2693183a798e) +* [学習について、共通編](./docs/train_README-ja.md) : データ整備やオプションなど + * [データセット設定](./docs/config_README-ja.md) +* [DreamBoothの学習について](./docs/train_db_README-ja.md) +* [fine-tuningのガイド](./docs/fine_tune_README_ja.md): +* [LoRAの学習について](./docs/train_network_README-ja.md) +* [Textual Inversionの学習について](./docs/train_ti_README-ja.md) +* [画像生成スクリプト](./docs/gen_img_README-ja.md) * note.com [モデル変換スクリプト](https://note.com/kohya_ss/n/n374f316fe4ad) ## Windowsでの動作に必要なプログラム @@ -115,6 +115,16 @@ accelerate configの質問には以下のように答えてください。(bf1 他のバージョンでは学習がうまくいかない場合があるようです。特に他の理由がなければ指定のバージョンをお使いください。 +### オプション:Lion8bitを使う + +Lion8bitを使う場合には`bitsandbytes`を0.38.0以降にアップグレードする必要があります。`bitsandbytes`をアンインストールし、Windows環境では例えば[こちら](https://github.com/jllllll/bitsandbytes-windows-webui)などからWindows版のwhlファイルをインストールしてください。たとえば以下のような手順になります。 + +```powershell +pip install https://github.com/jllllll/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl +``` + +アップグレード時には`pip install .`でこのリポジトリを更新し、必要に応じて他のパッケージもアップグレードしてください。 + ## アップグレード 新しいリリースがあった場合、以下のコマンドで更新できます。 diff --git a/README.md b/README.md index f0d60cb69..5c7a1d14e 100644 --- a/README.md +++ b/README.md @@ -2,14 +2,22 @@ This repository provides a Windows-focused Gradio GUI for [Kohya's Stable Diffusion trainers](https://github.com/kohya-ss/sd-scripts). The GUI allows you to set the training parameters and generate and run the required CLI commands to train the model. -If you run on Linux and would like to use the GUI, there is now a port of it as a docker container. You can find the project [here](https://github.com/P2Enjoy/kohya_ss-docker). - ### Table of Contents - [Tutorials](#tutorials) +* [Training guide - common](./docs/train_README-ja.md) : data preparation, options etc... 
+ * [Chinese version](./docs/train_README-zh.md) + * [Dataset config](./docs/config_README-ja.md) + * [DreamBooth training guide](./docs/train_db_README-ja.md) + * [Step by Step fine-tuning guide](./docs/fine_tune_README_ja.md) + * [Training LoRA](./docs/train_network_README-ja.md) + * [Training Textual Inversion](./docs/train_ti_README-ja.md) + * [Image generation](./docs/gen_img_README-ja.md) + * [Model conversion](https://note.com/kohya_ss/n/n374f316fe4ad) - [Required Dependencies](#required-dependencies) - [Linux/macOS](#linux-and-macos-dependencies) - [Installation](#installation) + - [Docker](#docker) - [Linux/macOS](#linux-and-macos) - [Default Install Locations](#install-location) - [Windows](#windows) @@ -45,6 +53,10 @@ Newer Tutorial: [Generate Studio Quality Realistic Photos By Kohya LoRA Stable D [![Newer Tutorial: Generate Studio Quality Realistic Photos By Kohya LoRA Stable Diffusion Training](https://user-images.githubusercontent.com/19240467/235306147-85dd8126-f397-406b-83f2-368927fa0281.png)](https://www.youtube.com/watch?v=TpuDOsuKIBo) +Newer Tutorial: [How To Install And Use Kohya LoRA GUI / Web UI on RunPod IO](https://www.youtube.com/watch?v=3uzCNrQao3o): + +[![How To Install And Use Kohya LoRA GUI / Web UI on RunPod IO With Stable Diffusion & Automatic1111](https://github-production-user-asset-6210df.s3.amazonaws.com/19240467/238678226-0c9c3f7d-c308-4793-b790-999fdc271372.png)](https://www.youtube.com/watch?v=3uzCNrQao3o) + ## Required Dependencies - Install [Python 3.10](https://www.python.org/ftp/python/3.10.9/python-3.10.9-amd64.exe) @@ -61,10 +73,34 @@ These dependencies are taken care of via `setup.sh` in the installation section. ### Runpod Follow the instructions found in this discussion: https://github.com/bmaltais/kohya_ss/discussions/379 +### Docker +Docker is supported on Windows and Linux distributions. However, this method currently supports only Nvidia GPUs. +Run the following commands in your OS shell after installing [git](https://git-scm.com/download/) and [docker](https://www.docker.com/products/docker-desktop/): +```bash +git clone https://github.com/bmaltais/kohya_ss.git +cd kohya_ss +docker compose build +docker compose run --service-ports kohya-ss-gui +``` + +This will take a while (up to 20 minutes) on the first run. + +The following limitations apply: +* All training data must be added to the `dataset` subdirectory; the docker container cannot access any other files +* The file picker does not work + * Folders cannot be selected; the folder path must be entered manually, e.g. /dataset/my_lora/img + * Config files cannot be selected; they must be loaded by path instead, e.g. /dataset/my_config.json +* Dialogs do not work + * Make sure your file names are unique, since the dialog that normally asks whether an existing file should be overwritten cannot be shown +* No auto-update support. Update scripts must be run manually outside docker, followed by a rebuild with `docker compose build`. + + +If you run on Linux, there is an alternative docker container port with fewer limitations. You can find the project [here](https://github.com/P2Enjoy/kohya_ss-docker). + ### Linux and macOS In the terminal, run -``` +```bash git clone https://github.com/bmaltais/kohya_ss.git cd kohya_ss # May need to chmod +x ./setup.sh if you're on a machine with stricter security. @@ -259,7 +295,7 @@ The LoRA supported by `train_network.py` has been named to avoid confusion. The LoRA-LierLa is the default LoRA type for `train_network.py` (without `conv_dim` network arg).
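For reference, the sketch below shows how the LoRA-C3Lier variant might be requested instead, by adding the `conv_dim` (and optionally `conv_alpha`) network args documented for `train_network.py`. This is a hedged sketch, not a recommended configuration: the model, dataset, and output paths as well as the dim/alpha values are placeholders.

```bash
# Hedged sketch only: passing a conv_dim network arg switches training from the
# default LoRA-LierLa to LoRA-C3Lier (3x3 Conv2d layers are then also trained).
# All paths and the dim/alpha values below are placeholders.
accelerate launch --num_cpu_threads_per_process 1 train_network.py \
  --pretrained_model_name_or_path=model.safetensors \
  --dataset_config=dataset_config.toml \
  --output_dir=outputs --output_name=my_lora \
  --network_module=networks.lora \
  --network_dim=16 --network_alpha=8 \
  --network_args "conv_dim=4" "conv_alpha=1"
```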
LoRA-LierLa can be used with [our extension](https://github.com/kohya-ss/sd-webui-additional-networks) for AUTOMATIC1111's Web UI, or with the built-in LoRA feature of the Web UI. -To use LoRA-C3Liar with Web UI, please use our extension. +To use LoRA-C3Lier with Web UI, please use our extension. ## Sample image generation during training A prompt file might look like this, for example @@ -309,45 +345,57 @@ This will store a backup file with your current locally installed pip packages a ## Change History -* 2023/04/25 (v21.5.7) - - `tag_images_by_wd14_tagger.py` can now get arguments from outside. [PR #453](https://github.com/kohya-ss/sd-scripts/pull/453) Thanks to mio2333! - - Added `--save_every_n_steps` option to each training script. The model is saved every specified steps. - - `--save_last_n_steps` option can be used to save only the specified number of models (old models will be deleted). - - If you specify the `--save_state` option, the state will also be saved at the same time. You can specify the number of steps to keep the state with the `--save_last_n_steps_state` option (the same value as `--save_last_n_steps` is used if omitted). - - You can use the epoch-based model saving and state saving options together. - - Not tested in multi-GPU environment. Please report any bugs. - - `--cache_latents_to_disk` option automatically enables `--cache_latents` option when specified. [#438](https://github.com/kohya-ss/sd-scripts/issues/438) - - Fixed a bug in `gen_img_diffusers.py` where latents upscaler would fail with a batch size of 2 or more. - - Fix issue with using earlier version than python 3.10 in Linux. Thanks @Whyjsee -* 2023/04/24 (v21.5.6) - - Fix triton error - - Fix issue with merge lora path with spaces - - Added support for logging to wandb. Please refer to PR #428. Thank you p1atdev! - - wandb installation is required. Please install it with pip install wandb. Login to wandb with wandb login command, or set --wandb_api_key option for automatic login. - - Please let me know if you find any bugs as the test is not complete. - - You can automatically login to wandb by setting the --wandb_api_key option. Please be careful with the handling of API Key. PR #435 Thank you Linaqruf! - - Improved the behavior of --debug_dataset on non-Windows environments. PR #429 Thank you tsukimiya! - - Fixed --face_crop_aug option not working in Fine tuning method. - - Prepared code to use any upscaler in gen_img_diffusers.py. - - Fixed to log to TensorBoard when --logging_dir is specified and --log_with is not specified. -* 2023/04/22 (v21.5.5) - - Update LoRA merge GUI to support SD checkpoint merge and up to 4 LoRA merging - - Fixed `lora_interrogator.py` not working. Please refer to [PR #392](https://github.com/kohya-ss/sd-scripts/pull/392) for details. Thank you A2va and heyalexchoi! - - Fixed the handling of tags containing `_` in `tag_images_by_wd14_tagger.py`. - - Add new Extract DyLoRA gui to the Utilities tab. - - Add new Merge LyCORIS models into checkpoint gui to the Utilities tab. - - Add new info on startup to help debug things -* 2023/04/17 (v21.5.4) - - Fixed a bug that caused an error when loading DyLoRA with the `--network_weight` option in `train_network.py`. - - Added the `--recursive` option to each script in the `finetune` folder to process folders recursively. Please refer to [PR #400](https://github.com/kohya-ss/sd-scripts/pull/400/) for details. Thanks to Linaqruf! 
- - Upgrade Gradio to latest release - Fix issue when Adafactor is used as optimizer and LR Warmup is not 0: https://github.com/bmaltais/kohya_ss/issues/617 - Added support for DyLoRA in `train_network.py`. Please refer to [here](./train_network_README-ja.md#dylora) for details (currently only in Japanese). - Added support for caching latents to disk in each training script. Please specify __both__ `--cache_latents` and `--cache_latents_to_disk` options. - The files are saved in the same folder as the images with the extension `.npz`. If you specify the `--flip_aug` option, the files with `_flip.npz` will also be saved. - Multi-GPU training has not been tested. - This feature is not tested with all combinations of datasets and training scripts, so there may be bugs. - Added workaround for an error that occurs when training with `fp16` or `bf16` in `fine_tune.py`. - Implemented DyLoRA GUI support. There will now be a new 'DyLoRA Unit` slider when the LoRA type is selected as `kohya DyLoRA` to specify the desired Unit value for DyLoRA training. - Update gui.bat and gui.ps1 based on: https://github.com/bmaltais/kohya_ss/issues/188 - Update `setup.bat` to install torch 2.0.0 instead of 1.2.1. If you want to upgrade from 1.2.1 to 2.0.0 run setup.bat again, select 1 to uninstall the previous torch modules, then select 2 for torch 2.0.0 +* 2023/05/28 (v21.5.15) +- Show warning when image caption file does not exist during training. [PR #533](https://github.com/kohya-ss/sd-scripts/pull/533) Thanks to TingTingin! + - Warning is also displayed when using class+identifier dataset. Please ignore if it is intended. +- `train_network.py` now supports merging network weights before training. [PR #542](https://github.com/kohya-ss/sd-scripts/pull/542) Thanks to u-haru! + - `--base_weights` option specifies LoRA or other model files (multiple files are allowed) to merge. + - `--base_weights_multiplier` option specifies multiplier of the weights to merge (multiple values are allowed). If omitted, or if fewer values than `base_weights` are given, 1.0 is used. + - This is useful for incremental learning. See PR for details. +- Show warning and continue training when uploading to HuggingFace fails. +* 2023/05/28 (v21.5.14) +- Add Create Group tool and GUI +* 2023/05/24 (v21.5.13) +- Upgrade gradio release to fix an issue with UI refresh on config load. +- [D-Adaptation v3.0](https://github.com/facebookresearch/dadaptation) is now supported. [PR #530](https://github.com/kohya-ss/sd-scripts/pull/530) Thanks to sdbds! + - `--optimizer_type` now accepts `DAdaptAdamPreprint`, `DAdaptAdanIP`, and `DAdaptLion`. + - `DAdaptAdam` is now a new implementation. The old `DAdaptAdam` behavior is available as `DAdaptAdamPreprint`. + - Simply specifying `DAdaptation` will use `DAdaptAdamPreprint` (same behavior as before). + - You need to install D-Adaptation v3.0. After activating venv, please do `pip install -U dadaptation`. + - See PR and D-Adaptation documentation for details. +* 2023/05/22 (v21.5.12) +- Fixed several bugs. + - The state is saved even when the `--save_state` option is not specified in `fine_tune.py` and `train_db.py`. [PR #521](https://github.com/kohya-ss/sd-scripts/pull/521) Thanks to akshaal! + - Cannot load LoRA without `alpha`. [PR #527](https://github.com/kohya-ss/sd-scripts/pull/527) Thanks to Manjiz! + - Minor changes to console output during sample generation. [PR #515](https://github.com/kohya-ss/sd-scripts/pull/515) Thanks to yanhuifair! +- The generation script now uses xformers for VAE as well.
+- Fixed an issue where an error would occur if the encoding of the prompt file was different from the default. [PR #510](https://github.com/kohya-ss/sd-scripts/pull/510) Thanks to sdbds! + - Please save the prompt file in UTF-8. +* 2023/05/15 (v21.5.11) + - Added an option `--dim_from_weights` to `train_network.py` to automatically determine the dim(rank) from the weight file. [PR #491](https://github.com/kohya-ss/sd-scripts/pull/491) Thanks to AI-Casanova! + - It is useful in combination with `resize_lora.py`. Please see the PR for details. + - Fixed a bug where the noise resolution was incorrect with Multires noise. [PR #489](https://github.com/kohya-ss/sd-scripts/pull/489) Thanks to sdbds! + - Please see the PR for details. + - The image generation scripts can now use img2img and highres fix at the same time. + - Fixed a bug where the hint image of ControlNet was incorrectly BGR instead of RGB in the image generation scripts. + - Added a feature to the image generation scripts to use the memory-efficient VAE. + - If you specify a number with the `--vae_slices` option, the memory-efficient VAE will be used. The maximum output size will be larger, but it will be slower. Please specify a value of about `16` or `32`. + - The implementation of the VAE is in `library/slicing_vae.py`. + - Fix for wandb. Thanks to @ebabchick! + - Added [English translation of documents](https://github.com/darkstorm2150/sd-scripts#links-to-usage-documentation) by darkstorm2150. Thank you very much! + - The prompt for sample generation during training can now be specified in `.toml` or `.json`. [PR #504](https://github.com/kohya-ss/sd-scripts/pull/504) Thanks to Linaqruf! + - For details on prompt description, please see the PR. +* 2023/05/07 (v21.5.10) + - Fix issue https://github.com/bmaltais/kohya_ss/issues/734 + - The documentation has been moved to the `docs` folder. If you have links, please change them. + - DAdaptAdaGrad, DAdaptAdan, and DAdaptSGD are now supported by DAdaptation. [PR#455](https://github.com/kohya-ss/sd-scripts/pull/455) Thanks to sdbds! + - DAdaptation needs to be installed. Also, depending on the optimizer, DAdaptation may need to be updated. Please update with `pip install --upgrade dadaptation`. + - Added support for pre-calculation of LoRA weights in image generation scripts. Specify `--network_pre_calc`. + - The prompt option `--am` is available. Also, it is disabled when Regional LoRA is used. + - Added Adaptive noise scale to each training script. Specify a number with `--adaptive_noise_scale` to enable it (see the example command after this list). + - __Experimental option. It may be removed or changed in the future.__ + - This is an original implementation that automatically adjusts the value of the noise offset according to the absolute value of the mean of each channel of the latents. It is expected that appropriate noise offsets will be set for bright and dark images, respectively. + - Specify it together with `--noise_offset`. + - The actual value of the noise offset is calculated as `noise_offset + abs(mean(latents, dim=(2,3))) * adaptive_noise_scale`. Since the latent is close to a normal distribution, it may be a good idea to specify a value from about 1/10 of the noise offset up to the same value as the noise offset. + - Negative values can also be specified, in which case the noise offset will be clipped to 0 or more. + - Other minor fixes.
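To make the experimental noise options in the v21.5.10 entry concrete, here is a hedged sketch of a training invocation combining `--noise_offset` with `--adaptive_noise_scale`. The paths are placeholders, and the values only follow the rough guidance above (noise offset around `0.1`, adaptive noise scale around 1/10 of that); this is not a recommended configuration.

```bash
# Hedged sketch only: the effective offset becomes
# noise_offset + abs(mean(latents, dim=(2,3))) * adaptive_noise_scale.
# Paths and values below are placeholders, not recommendations.
accelerate launch --num_cpu_threads_per_process 1 train_network.py \
  --pretrained_model_name_or_path=model.safetensors \
  --dataset_config=dataset_config.toml \
  --output_dir=outputs --output_name=my_lora \
  --network_module=networks.lora \
  --noise_offset=0.1 --adaptive_noise_scale=0.01
```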
diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 000000000..afaa1bb74 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,30 @@ +version: "3.8" +services: + kohya-ss-gui: + container_name: kohya-ss-gui + image: kohya-ss-gui:latest + build: + context: . + ports: + - 127.0.0.1:7860:7860 + tty: true + ipc: host + environment: + CLI_ARGS: "" + SAFETENSORS_FAST_GPU: 1 + tmpfs: + - /tmp + volumes: + - ./dataset:/dataset + - ./.cache/user:/home/appuser/.cache + - ./.cache/triton:/home/appuser/.triton + - ./.cache/config:/app/appuser/.config + - ./.cache/nv:/home/appuser/.nv + - ./.cache/keras:/home/appuser/.keras + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['0'] + capabilities: [gpu] diff --git a/config_README-ja.md b/docs/config_README-ja.md similarity index 100% rename from config_README-ja.md rename to docs/config_README-ja.md diff --git a/fine_tune_README_ja.md b/docs/fine_tune_README_ja.md similarity index 100% rename from fine_tune_README_ja.md rename to docs/fine_tune_README_ja.md diff --git a/docs/gen_img_README-ja.md b/docs/gen_img_README-ja.md new file mode 100644 index 000000000..cf35f1df7 --- /dev/null +++ b/docs/gen_img_README-ja.md @@ -0,0 +1,454 @@ +SD 1.xおよび2.xのモデル、当リポジトリで学習したLoRA、ControlNet(v1.0のみ動作確認)などに対応した、Diffusersベースの推論(画像生成)スクリプトです。コマンドラインから用います。 + +# 概要 + +* Diffusers (v0.10.2) ベースの推論(画像生成)スクリプト。 +* SD 1.xおよび2.x (base/v-parameterization)モデルに対応。 +* txt2img、img2img、inpaintingに対応。 +* 対話モード、およびファイルからのプロンプト読み込み、連続生成に対応。 +* プロンプト1行あたりの生成枚数を指定可能。 +* 全体の繰り返し回数を指定可能。 +* `fp16`だけでなく`bf16`にも対応。 +* xformersに対応し高速生成が可能。 + * xformersにより省メモリ生成を行いますが、Automatic 1111氏のWeb UIほど最適化していないため、512*512の画像生成でおおむね6GB程度のVRAMを使用します。 +* プロンプトの225トークンへの拡張。ネガティブプロンプト、重みづけに対応。 +* Diffusersの各種samplerに対応(Web UIよりもsampler数は少ないです)。 +* Text Encoderのclip skip(最後からn番目の層の出力を用いる)に対応。 +* VAEの別途読み込み。 +* CLIP Guided Stable Diffusion、VGG16 Guided Stable Diffusion、Highres. fix、upscale対応。 + * Highres. 
fixはWeb UIの実装を全く確認していない独自実装のため、出力結果は異なるかもしれません。 +* LoRA対応。適用率指定、複数LoRA同時利用、重みのマージに対応。 + * Text EncoderとU-Netで別の適用率を指定することはできません。 +* Attention Coupleに対応。 +* ControlNet v1.0に対応。 +* 途中でモデルを切り替えることはできませんが、バッチファイルを組むことで対応できます。 +* 個人的に欲しくなった機能をいろいろ追加。 + +機能追加時にすべてのテストを行っているわけではないため、以前の機能に影響が出て一部機能が動かない可能性があります。何か問題があればお知らせください。 + +# 基本的な使い方 + +## 対話モードでの画像生成 + +以下のように入力してください。 + +```batchfile +python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先> --xformers --fp16 --interactive +``` + +`--ckpt`オプションにモデル(Stable Diffusionのcheckpointファイル、またはDiffusersのモデルフォルダ)、`--outdir`オプションに画像の出力先フォルダを指定します。 + +`--xformers`オプションでxformersの使用を指定します(xformersを使わない場合は外してください)。`--fp16`オプションでfp16(単精度)での推論を行います。RTX 30系のGPUでは `--bf16`オプションでbf16(bfloat16)での推論を行うこともできます。 + +`--interactive`オプションで対話モードを指定しています。 + +Stable Diffusion 2.0(またはそこからの追加学習モデル)を使う場合は`--v2`オプションを追加してください。v-parameterizationを使うモデル(`768-v-ema.ckpt`およびそこからの追加学習モデル)を使う場合はさらに`--v_parameterization`を追加してください。 + +`--v2`の指定有無が間違っているとモデル読み込み時にエラーになります。`--v_parameterization`の指定有無が間違っていると茶色い画像が表示されます。 + +`Type prompt:`と表示されたらプロンプトを入力してください。 + +![image](https://user-images.githubusercontent.com/52813779/235343115-f3b8ac82-456d-4aab-9724-0cc73c4534aa.png) + +※画像が表示されずエラーになる場合、headless(画面表示機能なし)のOpenCVがインストールされているかもしれません。`pip install opencv-python`として通常のOpenCVを入れてください。または`--no_preview`オプションで画像表示を止めてください。 + +画像ウィンドウを選択してから何らかのキーを押すとウィンドウが閉じ、次のプロンプトが入力できます。プロンプトでCtrl+Z、エンターの順に打鍵するとスクリプトを閉じます。 + +## 単一のプロンプトで画像を一括生成 + +以下のように入力します(実際には1行で入力します)。 + +```batchfile +python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先> + --xformers --fp16 --images_per_prompt <生成枚数> --prompt "<プロンプト>" +``` + +`--images_per_prompt`オプションで、プロンプト1件当たりの生成枚数を指定します。`--prompt`オプションでプロンプトを指定します。スペースを含む場合はダブルクォーテーションで囲んでください。 + +`--batch_size`オプションでバッチサイズを指定できます(後述)。 + +## ファイルからプロンプトを読み込み一括生成 + +以下のように入力します。 + +```batchfile +python gen_img_diffusers.py --ckpt <モデル名> --outdir <画像出力先> + --xformers --fp16 --from_file <プロンプトファイル名> +``` + +`--from_file`オプションで、プロンプトが記述されたファイルを指定します。1行1プロンプトで記述してください。`--images_per_prompt`オプションを指定して1行あたり生成枚数を指定できます。 + +## ネガティブプロンプト、重みづけの使用 + +プロンプトオプション(プロンプト内で`--x`のように指定、後述)で`--n`を書くと、以降がネガティブプロンプトとなります。 + +またAUTOMATIC1111氏のWeb UIと同様の `()` や` []` 、`(xxx:1.3)` などによる重みづけが可能です(実装はDiffusersの[Long Prompt Weighting Stable Diffusion](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#long-prompt-weighting-stable-diffusion)からコピーしたものです)。 + +コマンドラインからのプロンプト指定、ファイルからのプロンプト読み込みでも同様に指定できます。 + +![image](https://user-images.githubusercontent.com/52813779/235343128-e79cd768-ec59-46f5-8395-fce9bdc46208.png) + +# 主なオプション + +コマンドラインから指定してください。 + +## モデルの指定 + +- `--ckpt <モデル名>`:モデル名を指定します。`--ckpt`オプションは必須です。Stable Diffusionのcheckpointファイル、またはDiffusersのモデルフォルダ、Hugging FaceのモデルIDを指定できます。 + +- `--v2`:Stable Diffusion 2.x系のモデルを使う場合に指定します。1.x系の場合には指定不要です。 + +- `--v_parameterization`:v-parameterizationを使うモデルを使う場合に指定します(`768-v-ema.ckpt`およびそこからの追加学習モデル、Waifu Diffusion v1.5など)。 + + `--v2`の指定有無が間違っているとモデル読み込み時にエラーになります。`--v_parameterization`の指定有無が間違っていると茶色い画像が表示されます。 + +- `--vae`:使用するVAEを指定します。未指定時はモデル内のVAEを使用します。 + +## 画像生成と出力 + +- `--interactive`:インタラクティブモードで動作します。プロンプトを入力すると画像が生成されます。 + +- `--prompt <プロンプト>`:プロンプトを指定します。スペースを含む場合はダブルクォーテーションで囲んでください。 + +- `--from_file <プロンプトファイル名>`:プロンプトが記述されたファイルを指定します。1行1プロンプトで記述してください。なお画像サイズやguidance scaleはプロンプトオプション(後述)で指定できます。 + +- `--W <画像幅>`:画像の幅を指定します。デフォルトは`512`です。 + +- `--H <画像高さ>`:画像の高さを指定します。デフォルトは`512`です。 + +- `--steps <ステップ数>`:サンプリングステップ数を指定します。デフォルトは`50`です。 + +- `--scale 
<ガイダンススケール>`:unconditionalガイダンススケールを指定します。デフォルトは`7.5`です。 + +- `--sampler <サンプラー名>`:サンプラーを指定します。デフォルトは`ddim`です。Diffusersで提供されているddim、pndm、dpmsolver、dpmsolver++、lms、euler、euler_a、が指定可能です(後ろの三つはk_lms、k_euler、k_euler_aでも指定できます)。 + +- `--outdir <画像出力先フォルダ>`:画像の出力先を指定します。 + +- `--images_per_prompt <生成枚数>`:プロンプト1件当たりの生成枚数を指定します。デフォルトは`1`です。 + +- `--clip_skip <スキップ数>`:CLIPの後ろから何番目の層を使うかを指定します。省略時は最後の層を使います。 + +- `--max_embeddings_multiples <倍数>`:CLIPの入出力長をデフォルト(75)の何倍にするかを指定します。未指定時は75のままです。たとえば3を指定すると入出力長が225になります。 + +- `--negative_scale` : unconditioningのguidance scaleを個別に指定します。[gcem156氏のこちらの記事](https://note.com/gcem156/n/ne9a53e4a6f43)を参考に実装したものです。 + +## メモリ使用量や生成速度の調整 + +- `--batch_size <バッチサイズ>`:バッチサイズを指定します。デフォルトは`1`です。バッチサイズが大きいとメモリを多く消費しますが、生成速度が速くなります。 + +- `--vae_batch_size`:VAEのバッチサイズを指定します。デフォルトはバッチサイズと同じです。 + VAEのほうがメモリを多く消費するため、デノイジング後(stepが100%になった後)でメモリ不足になる場合があります。このような場合にはVAEのバッチサイズを小さくしてください。 + +- `--xformers`:xformersを使う場合に指定します。 + +- `--fp16`:fp16(半精度)での推論を行います。`fp16`と`bf16`をどちらも指定しない場合はfp32(単精度)での推論を行います。 + +- `--bf16`:bf16(bfloat16)での推論を行います。RTX 30系のGPUでのみ指定可能です。`--bf16`オプションはRTX 30系以外のGPUではエラーになります。`fp16`よりも`bf16`のほうが推論結果がNaNになる(真っ黒の画像になる)可能性が低いようです。 + +## 追加ネットワーク(LoRA等)の使用 + +- `--network_module`:使用する追加ネットワークを指定します。LoRAの場合は`--network_module networks.lora`と指定します。複数のLoRAを使用する場合は`--network_module networks.lora networks.lora networks.lora`のように指定します。 + +- `--network_weights`:使用する追加ネットワークの重みファイルを指定します。`--network_weights model.safetensors`のように指定します。複数のLoRAを使用する場合は`--network_weights model1.safetensors model2.safetensors model3.safetensors`のように指定します。引数の数は`--network_module`で指定した数と同じにしてください。 + +- `--network_mul`:使用する追加ネットワークの重みを何倍にするかを指定します。デフォルトは`1`です。`--network_mul 0.8`のように指定します。複数のLoRAを使用する場合は`--network_mul 0.4 0.5 0.7`のように指定します。引数の数は`--network_module`で指定した数と同じにしてください。 + +- `--network_merge`:使用する追加ネットワークの重みを`--network_mul`に指定した重みであらかじめマージします。`--network_pre_calc` と同時に使用できません。プロンプトオプションの`--am`、およびRegional LoRAは使用できなくなりますが、LoRA未使用時と同じ程度まで生成が高速化されます。 + +- `--network_pre_calc`:使用する追加ネットワークの重みを生成ごとにあらかじめ計算します。プロンプトオプションの`--am`が使用できます。LoRA未使用時と同じ程度まで生成は高速化されますが、生成前に重みを計算する時間が必要で、またメモリ使用量も若干増加します。Regional LoRA使用時は無効になります。 + +# 主なオプションの指定例 + +次は同一プロンプトで64枚をバッチサイズ4で一括生成する例です。 + +```batchfile +python gen_img_diffusers.py --ckpt model.ckpt --outdir outputs + --xformers --fp16 --W 512 --H 704 --scale 12.5 --sampler k_euler_a + --steps 32 --batch_size 4 --images_per_prompt 64 + --prompt "beautiful flowers --n monochrome" +``` + +次はファイルに書かれたプロンプトを、それぞれ10枚ずつ、バッチサイズ4で一括生成する例です。 + +```batchfile +python gen_img_diffusers.py --ckpt model.ckpt --outdir outputs + --xformers --fp16 --W 512 --H 704 --scale 12.5 --sampler k_euler_a + --steps 32 --batch_size 4 --images_per_prompt 10 + --from_file prompts.txt +``` + +Textual Inversion(後述)およびLoRAの使用例です。 + +```batchfile +python gen_img_diffusers.py --ckpt model.safetensors + --scale 8 --steps 48 --outdir txt2img --xformers + --W 512 --H 768 --fp16 --sampler k_euler_a + --textual_inversion_embeddings goodembed.safetensors negprompt.pt + --network_module networks.lora networks.lora + --network_weights model1.safetensors model2.safetensors + --network_mul 0.4 0.8 + --clip_skip 2 --max_embeddings_multiples 1 + --batch_size 8 --images_per_prompt 1 --interactive +``` + +# プロンプトオプション + +プロンプト内で、`--n`のように「ハイフンふたつ+アルファベットn文字」でプロンプトから各種オプションの指定が可能です。対話モード、コマンドライン、ファイル、いずれからプロンプトを指定する場合でも有効です。 + +プロンプトのオプション指定`--n`の前後にはスペースを入れてください。 + +- `--n`:ネガティブプロンプトを指定します。 + +- `--w`:画像幅を指定します。コマンドラインからの指定を上書きします。 + +- `--h`:画像高さを指定します。コマンドラインからの指定を上書きします。 + +-
`--s`:ステップ数を指定します。コマンドラインからの指定を上書きします。 + +- `--d`:この画像の乱数seedを指定します。`--images_per_prompt`を指定している場合は「--d 1,2,3,4」のようにカンマ区切りで複数指定してください。 + ※様々な理由により、Web UIとは同じ乱数seedでも生成される画像が異なる場合があります。 + +- `--l`:guidance scaleを指定します。コマンドラインからの指定を上書きします。 + +- `--t`:img2img(後述)のstrengthを指定します。コマンドラインからの指定を上書きします。 + +- `--nl`:ネガティブプロンプトのguidance scaleを指定します(後述)。コマンドラインからの指定を上書きします。 + +- `--am`:追加ネットワークの重みを指定します。コマンドラインからの指定を上書きします。複数の追加ネットワークを使用する場合は`--am 0.8,0.5,0.3`のように __カンマ区切りで__ 指定します。 + +※これらのオプションを指定すると、バッチサイズよりも小さいサイズでバッチが実行される場合があります(これらの値が異なると一括生成できないため)。(あまり気にしなくて大丈夫ですが、ファイルからプロンプトを読み込み生成する場合は、これらの値が同一のプロンプトを並べておくと効率が良くなります。) + +例: +``` +(masterpiece, best quality), 1girl, in shirt and plated skirt, standing at street under cherry blossoms, upper body, [from below], kind smile, looking at another, [goodembed] --n realistic, real life, (negprompt), (lowres:1.1), (worst quality:1.2), (low quality:1.1), bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, normal quality, jpeg artifacts, signature, watermark, username, blurry --w 960 --h 640 --s 28 --d 1 +``` + +![image](https://user-images.githubusercontent.com/52813779/235343446-25654172-fff4-4aaf-977a-20d262b51676.png) + +# img2img + +## オプション + +- `--image_path`:img2imgに利用する画像を指定します。`--image_path template.png`のように指定します。フォルダを指定すると、そのフォルダの画像を順次利用します。 + +- `--strength`:img2imgのstrengthを指定します。`--strength 0.8`のように指定します。デフォルトは`0.8`です。 + +- `--sequential_file_name`:ファイル名を連番にするかどうかを指定します。指定すると生成されるファイル名が`im_000001.png`からの連番になります。 + +- `--use_original_file_name`:指定すると生成ファイル名がオリジナルのファイル名と同じになります。 + +## コマンドラインからの実行例 + +```batchfile +python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt + --outdir outputs --xformers --fp16 --scale 12.5 --sampler k_euler --steps 32 + --image_path template.png --strength 0.8 + --prompt "1girl, cowboy shot, brown hair, pony tail, brown eyes, + sailor school uniform, outdoors + --n lowres, bad anatomy, bad hands, error, missing fingers, cropped, + worst quality, low quality, normal quality, jpeg artifacts, (blurry), + hair ornament, glasses" + --batch_size 8 --images_per_prompt 32 +``` + +`--image_path`オプションにフォルダを指定すると、そのフォルダの画像を順次読み込みます。生成される枚数は画像枚数ではなく、プロンプト数になりますので、`--images_per_prompt`オプションを指定してimg2imgする画像の枚数とプロンプト数を合わせてください。 + +ファイルはファイル名でソートして読み込みます。なおソート順は文字列順となりますので(`1.jpg→2.jpg→10.jpg`ではなく`1.jpg→10.jpg→2.jpg`の順)、頭を0埋めするなどしてご対応ください(`01.jpg→02.jpg→10.jpg`)。 + +## img2imgを利用したupscale + +img2img時にコマンドラインオプションの`--W`と`--H`で生成画像サイズを指定すると、元画像をそのサイズにリサイズしてからimg2imgを行います。 + +またimg2imgの元画像がこのスクリプトで生成した画像の場合、プロンプトを省略すると、元画像のメタデータからプロンプトを取得しそのまま用います。これによりHighres. fixの2nd stageの動作だけを行うことができます。 + +## img2img時のinpainting + +画像およびマスク画像を指定してinpaintingできます(inpaintingモデルには対応しておらず、単にマスク領域を対象にimg2imgするだけです)。 + +オプションは以下の通りです。 + +- `--mask_image`:マスク画像を指定します。`--img_path`と同様にフォルダを指定すると、そのフォルダの画像を順次利用します。 + +マスク画像はグレースケール画像で、白の部分がinpaintingされます。境界をグラデーションしておくとなんとなく滑らかになりますのでお勧めです。 + +![image](https://user-images.githubusercontent.com/52813779/235343795-9eaa6d98-02ff-4f32-b089-80d1fc482453.png) + +# その他の機能 + +## Textual Inversion + +`--textual_inversion_embeddings`オプションで使用するembeddingsを指定します(複数指定可)。拡張子を除いたファイル名をプロンプト内で使用することで、そのembeddingsを利用します(Web UIと同様の使用法です)。ネガティブプロンプト内でも使用できます。 + +モデルとして、当リポジトリで学習したTextual Inversionモデル、およびWeb UIで学習したTextual Inversionモデル(画像埋め込みは非対応)を利用できます。 + +## Extended Textual Inversion + +`--textual_inversion_embeddings`の代わりに`--XTI_embeddings`オプションを指定してください。使用法は`--textual_inversion_embeddings`と同じです。 + +## Highres. 
fix + +AUTOMATIC1111氏のWeb UIにある機能の類似機能です(独自実装のためもしかしたらいろいろ異なるかもしれません)。最初に小さめの画像を生成し、その画像を元にimg2imgすることで、画像全体の破綻を防ぎつつ大きな解像度の画像を生成します。 + +2nd stageのstep数は`--steps` と`--strength`オプションの値から計算されます(`steps*strength`)。 + +img2imgと併用できません。 + +以下のオプションがあります。 + +- `--highres_fix_scale`:Highres. fixを有効にして、1st stageで生成する画像のサイズを、倍率で指定します。最終出力が1024x1024で、最初に512x512の画像を生成する場合は`--highres_fix_scale 0.5`のように指定します。Web UIでの指定の逆数になっていますのでご注意ください。 + +- `--highres_fix_steps`:1st stageの画像のステップ数を指定します。デフォルトは`28`です。 + +- `--highres_fix_save_1st`:1st stageの画像を保存するかどうかを指定します。 + +- `--highres_fix_latents_upscaling`:指定すると2nd stageの画像生成時に1st stageの画像をlatentベースでupscalingします(bilinearのみ対応)。未指定時は画像をLANCZOS4でupscalingします。 + +- `--highres_fix_upscaler`:2nd stageに任意のupscalerを利用します。現在は`--highres_fix_upscaler tools.latent_upscaler` のみ対応しています。 + +- `--highres_fix_upscaler_args`:`--highres_fix_upscaler`で指定したupscalerに渡す引数を指定します。 + `tools.latent_upscaler`の場合は、`--highres_fix_upscaler_args "weights=D:\Work\SD\Models\others\etc\upscaler-v1-e100-220.safetensors"`のように重みファイルを指定します。 + +コマンドラインの例です。 + +```batchfile +python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt + --n_iter 1 --scale 7.5 --W 1024 --H 1024 --batch_size 1 --outdir ../txt2img + --steps 48 --sampler ddim --fp16 + --xformers + --images_per_prompt 1 --interactive + --highres_fix_scale 0.5 --highres_fix_steps 28 --strength 0.5 +``` + +## ControlNet + +現在はControlNet 1.0のみ動作確認しています。プリプロセスはCannyのみサポートしています。 + +以下のオプションがあります。 + +- `--control_net_models`:ControlNetのモデルファイルを指定します。 + 複数指定すると、それらをstepごとに切り替えて利用します(Web UIのControlNet拡張の実装と異なります)。diffと通常の両方をサポートします。 + +- `--guide_image_path`:ControlNetに使うヒント画像を指定します。`--img_path`と同様にフォルダを指定すると、そのフォルダの画像を順次利用します。Canny以外のモデルの場合には、あらかじめプリプロセスを行っておいてください。 + +- `--control_net_preps`:ControlNetのプリプロセスを指定します。`--control_net_models`と同様に複数指定可能です。現在はcannyのみ対応しています。対象モデルでプリプロセスを使用しない場合は `none` を指定します。 + cannyの場合 `--control_net_preps canny_63_191`のように、閾値1と2を'_'で区切って指定できます。 + +- `--control_net_weights`:ControlNetの適用時の重みを指定します(`1.0`で通常、`0.5`なら半分の影響力で適用)。`--control_net_models`と同様に複数指定可能です。 + +- `--control_net_ratios`:ControlNetを適用するstepの範囲を指定します。`0.5`の場合は、step数の半分までControlNetを適用します。`--control_net_models`と同様に複数指定可能です。 + +コマンドラインの例です。 + +```batchfile +python gen_img_diffusers.py --ckpt model_ckpt --scale 8 --steps 48 --outdir txt2img --xformers + --W 512 --H 768 --bf16 --sampler k_euler_a + --control_net_models diff_control_sd15_canny.safetensors --control_net_weights 1.0 + --guide_image_path guide.png --control_net_ratios 1.0 --interactive +``` + +## Attention Couple + Regional LoRA + +プロンプトをいくつかの部分に分割し、それぞれのプロンプトを画像内のどの領域に適用するかを指定できる機能です。個別のオプションはありませんが、`mask_path`とプロンプトで指定します。 + +まず、プロンプトで` AND `を利用して、複数部分を定義します。最初の3つに対して領域指定ができ、以降の部分は画像全体へ適用されます。ネガティブプロンプトは画像全体に適用されます。 + +以下ではANDで3つの部分を定義しています。 + +``` +shs 2girls, looking at viewer, smile AND bsb 2girls, looking back AND 2girls --n bad quality, worst quality +``` + +次にマスク画像を用意します。マスク画像はカラーの画像で、RGBの各チャネルがプロンプトのANDで区切られた部分に対応します。またあるチャネルの値がすべて0の場合、画像全体に適用されます。 + +上記の例では、Rチャネルが`shs 2girls, looking at viewer, smile`、Gチャネルが`bsb 2girls, looking back`に、Bチャネルが`2girls`に対応します。次のようなマスク画像を使用すると、Bチャネルに指定がありませんので、`2girls`は画像全体に適用されます。 + +![image](https://user-images.githubusercontent.com/52813779/235343061-b4dc9392-3dae-4831-8347-1e9ae5054251.png) + +マスク画像は`--mask_path`で指定します。現在は1枚のみ対応しています。指定した画像サイズに自動的にリサイズされ適用されます。 + +ControlNetと組み合わせることも可能です(細かい位置指定にはControlNetとの組み合わせを推奨します)。 + +LoRAを指定すると、`--network_weights`で指定した複数のLoRAがそれぞれANDの各部分に対応します。現在の制約として、LoRAの数はANDの部分の数と同じである必要があります。 + +## CLIP 
Guided Stable Diffusion + +DiffusersのCommunity Examplesの[こちらのcustom pipeline](https://github.com/huggingface/diffusers/blob/main/examples/community/README.md#clip-guided-stable-diffusion)からソースをコピー、変更したものです。 + +通常のプロンプトによる生成指定に加えて、追加でより大規模のCLIPでプロンプトのテキストの特徴量を取得し、生成中の画像の特徴量がそのテキストの特徴量に近づくよう、生成される画像をコントロールします(私のざっくりとした理解です)。大きめのCLIPを使いますのでVRAM使用量はかなり増加し(VRAM 8GBでは512*512でも厳しいかもしれません)、生成時間も掛かります。 + +なお選択できるサンプラーはDDIM、PNDM、LMSのみとなります。 + +`--clip_guidance_scale`オプションにどの程度、CLIPの特徴量を反映するかを数値で指定します。先のサンプルでは100になっていますので、そのあたりから始めて増減すると良いようです。 + +デフォルトではプロンプトの先頭75トークン(重みづけの特殊文字を除く)がCLIPに渡されます。プロンプトの`--c`オプションで、通常のプロンプトではなく、CLIPに渡すテキストを別に指定できます(たとえばCLIPはDreamBoothのidentifier(識別子)や「1girl」などのモデル特有の単語は認識できないと思われますので、それらを省いたテキストが良いと思われます)。 + +コマンドラインの例です。 + +```batchfile +python gen_img_diffusers.py --ckpt v1-5-pruned-emaonly.ckpt --n_iter 1 + --scale 2.5 --W 512 --H 512 --batch_size 1 --outdir ../txt2img --steps 36 + --sampler ddim --fp16 --opt_channels_last --xformers --images_per_prompt 1 + --interactive --clip_guidance_scale 100 +``` + +## CLIP Image Guided Stable Diffusion + +テキストではなくCLIPに別の画像を渡し、その特徴量に近づくよう生成をコントロールする機能です。`--clip_image_guidance_scale`オプションで適用量の数値を、`--guide_image_path`オプションでguideに使用する画像(ファイルまたはフォルダ)を指定してください。 + +コマンドラインの例です。 + +```batchfile +python gen_img_diffusers.py --ckpt trinart_characters_it4_v1_vae_merged.ckpt + --n_iter 1 --scale 7.5 --W 512 --H 512 --batch_size 1 --outdir ../txt2img + --steps 80 --sampler ddim --fp16 --opt_channels_last --xformers + --images_per_prompt 1 --interactive --clip_image_guidance_scale 100 + --guide_image_path YUKA160113420I9A4104_TP_V.jpg +``` + +### VGG16 Guided Stable Diffusion + +指定した画像に近づくように画像生成する機能です。通常のプロンプトによる生成指定に加えて、追加でVGG16の特徴量を取得し、生成中の画像が指定したガイド画像に近づくよう、生成される画像をコントロールします。img2imgでの使用をお勧めします(通常の生成では画像がぼやけた感じになります)。CLIP Guided Stable Diffusionの仕組みを流用した独自の機能です。またアイデアはVGGを利用したスタイル変換から拝借しています。 + +なお選択できるサンプラーはDDIM、PNDM、LMSのみとなります。 + +`--vgg16_guidance_scale`オプションにどの程度、VGG16特徴量を反映するかを数値で指定します。試した感じでは100くらいから始めて増減すると良いようです。`--guide_image_path`オプションでguideに使用する画像(ファイルまたはフォルダ)を指定してください。 + +複数枚の画像を一括でimg2img変換し、元画像をガイド画像とする場合、`--guide_image_path`と`--image_path`に同じ値を指定すればOKです。 + +コマンドラインの例です。 + +```batchfile +python gen_img_diffusers.py --ckpt wd-v1-3-full-pruned-half.ckpt + --n_iter 1 --scale 5.5 --steps 60 --outdir ../txt2img + --xformers --sampler ddim --fp16 --W 512 --H 704 + --batch_size 1 --images_per_prompt 1 + --prompt "picturesque, 1girl, solo, anime face, skirt, beautiful face + --n lowres, bad anatomy, bad hands, error, missing fingers, + cropped, worst quality, low quality, normal quality, + jpeg artifacts, blurry, 3d, bad face, monochrome --d 1" + --strength 0.8 --image_path ..\src_image + --vgg16_guidance_scale 100 --guide_image_path ..\src_image +``` + +`--vgg16_guidance_layer`で特徴量取得に使用するVGG16のレイヤー番号を指定できます(デフォルトは20でconv4-2のReLUです)。上の層ほど画風を表現し、下の層ほどコンテンツを表現するといわれています。 + +![image](https://user-images.githubusercontent.com/52813779/235343813-3c1f0d7a-4fb3-4274-98e4-b92d76b551df.png) + +# その他のオプション + +- `--no_preview` : 対話モードでプレビュー画像を表示しません。OpenCVがインストールされていない場合や、出力されたファイルを直接確認する場合に指定してください。 + +- `--n_iter` : 生成を繰り返す回数を指定します。デフォルトは1です。プロンプトをファイルから読み込むとき、複数回の生成を行いたい場合に指定します。 + +- `--tokenizer_cache_dir` : トークナイザーのキャッシュディレクトリを指定します。(作業中) + +- `--seed` : 乱数seedを指定します。1枚生成時はその画像のseed、複数枚生成時は各画像のseedを生成するための乱数のseedになります(`--from_file`で複数画像生成するとき、`--seed`オプションを指定すると複数回実行したときに各画像が同じseedになります)。 + +- `--iter_same_seed` : プロンプトに乱数seedの指定がないとき、`--n_iter`の繰り返し内ではすべて同じseedを使います。`--from_file`で指定した複数のプロンプト間でseedを統一して比較するときに使います。 + +- 
`--diffusers_xformers` : Diffuserのxformersを使用します。 + +- `--opt_channels_last` : 推論時にテンソルのチャンネルを最後に配置します。場合によっては高速化されることがあります。 + +- `--network_show_meta` : 追加ネットワークのメタデータを表示します。 + diff --git a/train_README-ja.md b/docs/train_README-ja.md similarity index 93% rename from train_README-ja.md rename to docs/train_README-ja.md index fd66458a1..b64b18082 100644 --- a/train_README-ja.md +++ b/docs/train_README-ja.md @@ -463,27 +463,6 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b xformersオプションを指定するとxformersのCrossAttentionを用います。xformersをインストールしていない場合やエラーとなる場合(環境にもよりますが `mixed_precision="no"` の場合など)、代わりに `mem_eff_attn` オプションを指定すると省メモリ版CrossAttentionを使用します(xformersよりも速度は遅くなります)。 -- `--save_precision` - - 保存時のデータ精度を指定します。save_precisionオプションにfloat、fp16、bf16のいずれかを指定すると、その形式でモデルを保存します(DreamBooth、fine tuningでDiffusers形式でモデルを保存する場合は無効です)。モデルのサイズを削減したい場合などにお使いください。 - -- `--save_every_n_epochs` / `--save_state` / `--resume` - save_every_n_epochsオプションに数値を指定すると、そのエポックごとに学習途中のモデルを保存します。 - - save_stateオプションを同時に指定すると、optimizer等の状態も含めた学習状態を合わせて保存します(保存したモデルからも学習再開できますが、それに比べると精度の向上、学習時間の短縮が期待できます)。保存先はフォルダになります。 - - 学習状態は保存先フォルダに `-??????-state`(??????はエポック数)という名前のフォルダで出力されます。長時間にわたる学習時にご利用ください。 - - 保存された学習状態から学習を再開するにはresumeオプションを使います。学習状態のフォルダ(`output_dir` ではなくその中のstateのフォルダ)を指定してください。 - - なおAcceleratorの仕様により、エポック数、global stepは保存されておらず、resumeしたときにも1からになりますがご容赦ください。 - -- `--save_model_as` (DreamBooth, fine tuning のみ) - - モデルの保存形式を`ckpt, safetensors, diffusers, diffusers_safetensors` から選べます。 - - `--save_model_as=safetensors` のように指定します。Stable Diffusion形式(ckptまたはsafetensors)を読み込み、Diffusers形式で保存する場合、不足する情報はHugging Faceからv1.5またはv2.1の情報を落としてきて補完します。 - - `--clip_skip` `2` を指定すると、Text Encoder (CLIP) の後ろから二番目の層の出力を用います。1またはオプション省略時は最後の層を用います。 @@ -502,6 +481,12 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b clip_skipと同様に、モデルの学習状態と異なる長さで学習するには、ある程度の教師データ枚数、長めの学習時間が必要になると思われます。 +- `--weighted_captions` + + 指定するとAutomatic1111氏のWeb UIと同様の重み付きキャプションが有効になります。「Textual Inversion と XTI」以外の学習に使用できます。キャプションだけでなく DreamBooth 手法の token string でも有効です。 + + 重みづけキャプションの記法はWeb UIとほぼ同じで、(abc)や[abc]、(abc:1.23)などが使用できます。入れ子も可能です。括弧内にカンマを含めるとプロンプトのshuffle/dropoutで括弧の対応付けがおかしくなるため、括弧内にはカンマを含めないでください。 + - `--persistent_data_loader_workers` Windows環境で指定するとエポック間の待ち時間が大幅に短縮されます。 @@ -527,12 +512,28 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b その後ブラウザを開き、http://localhost:6006/ へアクセスすると表示されます。 +- `--log_with` / `--log_tracker_name` + + 学習ログの保存に関するオプションです。`tensorboard` だけでなく `wandb`への保存が可能です。詳細は [PR#428](https://github.com/kohya-ss/sd-scripts/pull/428)をご覧ください。 + - `--noise_offset` こちらの記事の実装になります: https://www.crosslabs.org//blog/diffusion-with-offset-noise 全体的に暗い、明るい画像の生成結果が良くなる可能性があるようです。LoRA学習でも有効なようです。`0.1` 程度の値を指定するとよいようです。 +- `--adaptive_noise_scale` (実験的オプション) + + Noise offsetの値を、latentsの各チャネルの平均値の絶対値に応じて自動調整するオプションです。`--noise_offset` と同時に指定することで有効になります。Noise offsetの値は `noise_offset + abs(mean(latents, dim=(2,3))) * adaptive_noise_scale` で計算されます。latentは正規分布に近いためnoise_offsetの1/10~同程度の値を指定するとよいかもしれません。 + + 負の値も指定でき、その場合はnoise offsetは0以上にclipされます。 + +- `--multires_noise_iterations` / `--multires_noise_discount` + + Multi resolution noise (pyramid noise)の設定です。詳細は [PR#471](https://github.com/kohya-ss/sd-scripts/pull/471) およびこちらのページ [Multi-Resolution Noise for Diffusion Model Training](https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2) を参照してください。 + + 
`--multires_noise_iterations` に数値を指定すると有効になります。6~10程度の値が良いようです。`--multires_noise_discount` に0.1~0.3 程度の値(LoRA学習等比較的データセットが小さい場合のPR作者の推奨)、ないしは0.8程度の値(元記事の推奨)を指定してください(デフォルトは 0.3)。 + - `--debug_dataset` このオプションを付けることで学習を行う前に事前にどのような画像データ、キャプションで学習されるかを確認できます。Escキーを押すと終了してコマンドラインに戻ります。`S`キーで次のステップ(バッチ)、`E`キーで次のエポックに進みます。 @@ -545,14 +546,62 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b DreamBoothおよびfine tuningでは、保存されるモデルはこのVAEを組み込んだものになります。 -- `--cache_latents` +- `--cache_latents` / `--cache_latents_to_disk` 使用VRAMを減らすためVAEの出力をメインメモリにキャッシュします。`flip_aug` 以外のaugmentationは使えなくなります。また全体の学習速度が若干速くなります。 + cache_latents_to_diskを指定するとキャッシュをディスクに保存します。スクリプトを終了し、再度起動した場合もキャッシュが有効になります。 + - `--min_snr_gamma` Min-SNR Weighting strategyを指定します。詳細は[こちら](https://github.com/kohya-ss/sd-scripts/pull/308)を参照してください。論文では`5`が推奨されています。 +## モデルの保存に関する設定 + +- `--save_precision` + + 保存時のデータ精度を指定します。save_precisionオプションにfloat、fp16、bf16のいずれかを指定すると、その形式でモデルを保存します(DreamBooth、fine tuningでDiffusers形式でモデルを保存する場合は無効です)。モデルのサイズを削減したい場合などにお使いください。 + +- `--save_every_n_epochs` / `--save_state` / `--resume` + + save_every_n_epochsオプションに数値を指定すると、そのエポックごとに学習途中のモデルを保存します。 + + save_stateオプションを同時に指定すると、optimizer等の状態も含めた学習状態を合わせて保存します(保存したモデルからも学習再開できますが、それに比べると精度の向上、学習時間の短縮が期待できます)。保存先はフォルダになります。 + + 学習状態は保存先フォルダに `-??????-state`(??????はエポック数)という名前のフォルダで出力されます。長時間にわたる学習時にご利用ください。 + + 保存された学習状態から学習を再開するにはresumeオプションを使います。学習状態のフォルダ(`output_dir` ではなくその中のstateのフォルダ)を指定してください。 + + なおAcceleratorの仕様により、エポック数、global stepは保存されておらず、resumeしたときにも1からになりますがご容赦ください。 + +- `--save_every_n_steps` + + save_every_n_stepsオプションに数値を指定すると、そのステップごとに学習途中のモデルを保存します。save_every_n_epochsと同時に指定できます。 + +- `--save_model_as` (DreamBooth, fine tuning のみ) + + モデルの保存形式を`ckpt, safetensors, diffusers, diffusers_safetensors` から選べます。 + + `--save_model_as=safetensors` のように指定します。Stable Diffusion形式(ckptまたはsafetensors)を読み込み、Diffusers形式で保存する場合、不足する情報はHugging Faceからv1.5またはv2.1の情報を落としてきて補完します。 + +- `--huggingface_repo_id` 等 + + huggingface_repo_idが指定されているとモデル保存時に同時にHuggingFaceにアップロードします。アクセストークンの取り扱いに注意してください(HuggingFaceのドキュメントを参照してください)。 + + 他の引数をたとえば以下のように指定してください。 + + - `--huggingface_repo_id "your-hf-name/your-model" --huggingface_path_in_repo "path" --huggingface_repo_type model --huggingface_repo_visibility private --huggingface_token hf_YourAccessTokenHere` + + huggingface_repo_visibilityに`public`を指定するとリポジトリが公開されます。省略時または`private`(などpublic以外)を指定すると非公開になります。 + + `--save_state`オプション指定時に`--save_state_to_huggingface`を指定するとstateもアップロードします。 + + `--resume`オプション指定時に`--resume_from_huggingface`を指定するとHuggingFaceからstateをダウンロードして再開します。その時の --resumeオプションは `--resume {repo_id}/{path_in_repo}:{revision}:{repo_type}`になります。 + + 例: `--resume_from_huggingface --resume your-hf-name/your-model/path/test-000002-state:main:model` + + `--async_upload`オプションを指定するとアップロードを非同期で行います。 + ## オプティマイザ関係 - `--optimizer_type` @@ -563,9 +612,16 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b - 過去のバージョンの--use_8bit_adam指定時と同じ - Lion : https://github.com/lucidrains/lion-pytorch - 過去のバージョンの--use_lion_optimizer指定時と同じ + - Lion8bit : 引数は同上 - SGDNesterov : [torch.optim.SGD](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html), nesterov=True - SGDNesterov8bit : 引数は同上 - - DAdaptation : https://github.com/facebookresearch/dadaptation + - DAdaptation(DAdaptAdamPreprint) : https://github.com/facebookresearch/dadaptation + - DAdaptAdam : 引数は同上 + - DAdaptAdaGrad : 引数は同上 + - DAdaptAdan : 引数は同上 + - DAdaptAdanIP : 引数は同上 + - DAdaptLion : 引数は同上 + - 
DAdaptSGD : 引数は同上 - AdaFactor : [Transformers AdaFactor](https://huggingface.co/docs/transformers/main_classes/optimizer_schedules) - 任意のオプティマイザ diff --git a/train_README-zh.md b/docs/train_README-zh.md similarity index 99% rename from train_README-zh.md rename to docs/train_README-zh.md index dbd266060..678832d2b 100644 --- a/train_README-zh.md +++ b/docs/train_README-zh.md @@ -550,8 +550,14 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b - Lion : https://github.com/lucidrains/lion-pytorch - 与过去版本中指定的 --use_lion_optimizer 相同 - SGDNesterov : [torch.optim.SGD](https://pytorch.org/docs/stable/generated/torch.optim.SGD.html), nesterov=True - - SGDNesterov8bit : 引数同上 - - DAdaptation : https://github.com/facebookresearch/dadaptation + - SGDNesterov8bit : 参数同上 + - DAdaptation(DAdaptAdamPreprint) : https://github.com/facebookresearch/dadaptation + - DAdaptAdam : 参数同上 + - DAdaptAdaGrad : 参数同上 + - DAdaptAdan : 参数同上 + - DAdaptAdanIP : 引数は同上 + - DAdaptLion : 参数同上 + - DAdaptSGD : 参数同上 - AdaFactor : [Transformers AdaFactor](https://huggingface.co/docs/transformers/main_classes/optimizer_schedules) - 任何优化器 diff --git a/train_db_README-ja.md b/docs/train_db_README-ja.md similarity index 100% rename from train_db_README-ja.md rename to docs/train_db_README-ja.md diff --git a/docs/train_db_README-zh.md b/docs/train_db_README-zh.md new file mode 100644 index 000000000..d8ea5f3ed --- /dev/null +++ b/docs/train_db_README-zh.md @@ -0,0 +1,162 @@ +这是DreamBooth的指南。 + +请同时查看[关于学习的通用文档](./train_README-zh.md)。 + +# 概要 + +DreamBooth是一种将特定主题添加到图像生成模型中进行学习,并使用特定识别子生成它的技术。论文链接。 + +具体来说,它可以将角色和绘画风格等添加到Stable Diffusion模型中进行学习,并使用特定的单词(例如`shs`)来调用(呈现在生成的图像中)。 + +脚本基于Diffusers的DreamBooth,但添加了以下功能(一些功能已在原始脚本中得到支持)。 + +脚本的主要功能如下: + +- 使用8位Adam优化器和潜在变量的缓存来节省内存(与Shivam Shrirao版相似)。 +- 使用xformers来节省内存。 +- 不仅支持512x512,还支持任意尺寸的训练。 +- 通过数据增强来提高质量。 +- 支持DreamBooth和Text Encoder + U-Net的微调。 +- 支持以Stable Diffusion格式读写模型。 +- 支持Aspect Ratio Bucketing。 +- 支持Stable Diffusion v2.0。 + +# 训练步骤 + +请先参阅此存储库的README以进行环境设置。 + +## 准备数据 + +请参阅[有关准备训练数据的说明](./train_README-zh.md)。 + +## 运行训练 + +运行脚本。以下是最大程度地节省内存的命令(实际上,这将在一行中输入)。请根据需要修改每行。它似乎需要约12GB的VRAM才能运行。 +``` +accelerate launch --num_cpu_threads_per_process 1 train_db.py + --pretrained_model_name_or_path=<.ckpt或.safetensord或Diffusers版模型的目录> + --dataset_config=<数据准备时创建的.toml文件> + --output_dir=<训练模型的输出目录> + --output_name=<训练模型输出时的文件名> + --save_model_as=safetensors + --prior_loss_weight=1.0 + --max_train_steps=1600 + --learning_rate=1e-6 + --optimizer_type="AdamW8bit" + --xformers + --mixed_precision="fp16" + --cache_latents + --gradient_checkpointing +``` +`num_cpu_threads_per_process` 通常应该设置为1。 + +`pretrained_model_name_or_path` 指定要进行追加训练的基础模型。可以指定 Stable Diffusion 的 checkpoint 文件(.ckpt 或 .safetensors)、Diffusers 的本地模型目录或模型 ID(如 "stabilityai/stable-diffusion-2")。 + +`output_dir` 指定保存训练后模型的文件夹。在 `output_name` 中指定模型文件名,不包括扩展名。使用 `save_model_as` 指定以 safetensors 格式保存。 + +在 `dataset_config` 中指定 `.toml` 文件。初始批处理大小应为 `1`,以减少内存消耗。 + +`prior_loss_weight` 是正则化图像损失的权重。通常设为1.0。 + +将要训练的步数 `max_train_steps` 设置为1600。在这里,学习率 `learning_rate` 被设置为1e-6。 + +为了节省内存,设置 `mixed_precision="fp16"`(在 RTX30 系列及更高版本中也可以设置为 `bf16`)。同时指定 `gradient_checkpointing`。 + +为了使用内存消耗较少的 8bit AdamW 优化器(将模型优化为适合于训练数据的状态),指定 `optimizer_type="AdamW8bit"`。 + +指定 `xformers` 选项,并使用 xformers 的 CrossAttention。如果未安装 xformers 或出现错误(具体情况取决于环境,例如使用 `mixed_precision="no"`),则可以指定 `mem_eff_attn` 选项以使用省内存版的 CrossAttention(速度会变慢)。 + +为了节省内存,指定 `cache_latents` 选项以缓存 VAE 的输出。 + +如果有足够的内存,请编辑 `.toml` 文件将批处理大小增加到大约 
`4`(可能会提高速度和精度)。此外,取消 `cache_latents` 选项可以进行数据增强。 + +### 常用选项 + +对于以下情况,请参阅“常用选项”部分。 + +- 学习 Stable Diffusion 2.x 或其衍生模型。 +- 学习基于 clip skip 大于等于2的模型。 +- 学习超过75个令牌的标题。 + +### 关于DreamBooth中的步数 + +为了实现省内存化,该脚本中每个步骤的学习次数减半(因为学习和正则化的图像在训练时被分为不同的批次)。 + +要进行与原始Diffusers版或XavierXiao的Stable Diffusion版几乎相同的学习,请将步骤数加倍。 + +(虽然在将学习图像和正则化图像整合后再打乱顺序,但我认为对学习没有太大影响。) + +关于DreamBooth的批量大小 + +与像LoRA这样的学习相比,为了训练整个模型,内存消耗量会更大(与微调相同)。 + +关于学习率 + +在Diffusers版中,学习率为5e-6,而在Stable Diffusion版中为1e-6,因此在上面的示例中指定了1e-6。 + +当使用旧格式的数据集指定命令行时 + +使用选项指定分辨率和批量大小。命令行示例如下。 +``` +accelerate launch --num_cpu_threads_per_process 1 train_db.py + --pretrained_model_name_or_path=<.ckpt或.safetensord或Diffusers版模型的目录> + --train_data_dir=<训练数据的目录> + --reg_data_dir=<正则化图像的目录> + --output_dir=<训练后模型的输出目录> + --output_name=<训练后模型输出文件的名称> + --prior_loss_weight=1.0 + --resolution=512 + --train_batch_size=1 + --learning_rate=1e-6 + --max_train_steps=1600 + --use_8bit_adam + --xformers + --mixed_precision="bf16" + --cache_latents + --gradient_checkpointing +``` + +## 使用训练好的模型生成图像 + +训练完成后,将在指定的文件夹中以指定的名称输出safetensors文件。 + +对于v1.4/1.5和其他派生模型,可以在此模型中使用Automatic1111先生的WebUI进行推断。请将其放置在models\Stable-diffusion文件夹中。 + +对于使用v2.x模型在WebUI中生成图像的情况,需要单独的.yaml文件来描述模型的规格。对于v2.x base,需要v2-inference.yaml,对于768/v,则需要v2-inference-v.yaml。请将它们放置在相同的文件夹中,并将文件扩展名之前的部分命名为与模型相同的名称。 +![image](https://user-images.githubusercontent.com/52813779/210776915-061d79c3-6582-42c2-8884-8b91d2f07313.png) + +每个yaml文件都在[Stability AI的SD2.0存储库](https://github.com/Stability-AI/stablediffusion/tree/main/configs/stable-diffusion)……之中。 + +# DreamBooth的其他主要选项 + +有关所有选项的详细信息,请参阅另一份文档。 + +## 不在中途开始对文本编码器进行训练 --stop_text_encoder_training + +如果在stop_text_encoder_training选项中指定一个数字,则在该步骤之后,将不再对文本编码器进行训练,只会对U-Net进行训练。在某些情况下,可能会期望提高精度。 + +(我们推测可能会有时候仅仅文本编码器会过度学习,而这样做可以避免这种情况,但详细影响尚不清楚。) + +## 不进行分词器的填充 --no_token_padding + +如果指定no_token_padding选项,则不会对分词器的输出进行填充(与Diffusers版本的旧DreamBooth相同)。 + + diff --git a/train_network_README-ja.md b/docs/train_network_README-ja.md similarity index 98% rename from train_network_README-ja.md rename to docs/train_network_README-ja.md index cb7cd726b..e620a8642 100644 --- a/train_network_README-ja.md +++ b/docs/train_network_README-ja.md @@ -276,7 +276,9 @@ python networks\merge_lora.py --sd_model ..\model\model.ckpt ### 複数のLoRAのモデルをマージする -複数のLoRAモデルをひとつずつSDモデルに適用する場合と、複数のLoRAモデルをマージしてからSDモデルにマージする場合とは、計算順序の関連で微妙に異なる結果になります。 +__複数のLoRAをマージする場合は原則として `svd_merge_lora.py` を使用してください。__ 単純なup同士やdown同士のマージでは、計算結果が正しくなくなるためです。 + +`merge_lora.py` によるマージは差分抽出法でLoRAを生成する場合等、ごく限られた場合でのみ有効です。 たとえば以下のようなコマンドラインになります。 @@ -294,7 +296,7 @@ python networks\merge_lora.py --ratiosにそれぞれのモデルの比率(どのくらい重みを元モデルに反映するか)を0~1.0の数値で指定します。二つのモデルを一対一でマージす場合は、「0.5 0.5」になります。「1.0 1.0」では合計の重みが大きくなりすぎて、恐らく結果はあまり望ましくないものになると思われます。 -v1で学習したLoRAとv2で学習したLoRA、rank(次元数)や``alpha``の異なるLoRAはマージできません。U-NetだけのLoRAとU-Net+Text EncoderのLoRAはマージできるはずですが、結果は未知数です。 +v1で学習したLoRAとv2で学習したLoRA、rank(次元数)の異なるLoRAはマージできません。U-NetだけのLoRAとU-Net+Text EncoderのLoRAはマージできるはずですが、結果は未知数です。 ### その他のオプション diff --git a/docs/train_network_README-zh.md b/docs/train_network_README-zh.md new file mode 100644 index 000000000..ed7a0c4ef --- /dev/null +++ b/docs/train_network_README-zh.md @@ -0,0 +1,466 @@ +# 关于LoRA的学习。 + +[LoRA: Low-Rank Adaptation of Large Language Models](https://arxiv.org/abs/2106.09685)(arxiv)、[LoRA](https://github.com/microsoft/LoRA)(github)这是应用于Stable Diffusion“稳定扩散”的内容。 + +[cloneofsimo先生的代码仓库](https://github.com/cloneofsimo/lora) 我们非常感謝您提供的参考。非常感謝。 + 
+通常情況下,LoRA只适用于Linear和Kernel大小为1x1的Conv2d,但也可以將其擴展到Kernel大小为3x3的Conv2d。 + +Conv2d 3x3的扩展最初是由 [cloneofsimo先生的代码仓库](https://github.com/cloneofsimo/lora) +而KohakuBlueleaf先生在[LoCon](https://github.com/KohakuBlueleaf/LoCon)中揭示了其有效性。我们深深地感谢KohakuBlueleaf先生。 + +看起来即使在8GB VRAM上也可以勉强运行。 + +请同时查看关于[学习的通用文档](./train_README-zh.md)。 +# 可学习的LoRA 类型 + +支持以下两种类型。以下是本仓库中自定义的名称。 + +1. __LoRA-LierLa__:(用于 __Li__ n __e__ a __r__ __La__ yers 的 LoRA,读作 "Liela") + + 适用于 Linear 和卷积层 Conv2d 的 1x1 Kernel 的 LoRA + +2. __LoRA-C3Lier__:(用于具有 3x3 Kernel 的卷积层和 __Li__ n __e__ a __r__ 层的 LoRA,读作 "Seria") + + 除了第一种类型外,还适用于 3x3 Kernel 的 Conv2d 的 LoRA + +与 LoRA-LierLa 相比,LoRA-C3Lier 可能会获得更高的准确性,因为它适用于更多的层。 + +在训练时,也可以使用 __DyLoRA__(将在后面介绍)。 + +## 请注意与所学模型相关的事项。 + +LoRA-LierLa可以用于AUTOMATIC1111先生的Web UI LoRA功能。 + +要使用LoRA-C3Liar并在Web UI中生成,请使用此处的[WebUI用extension](https://github.com/kohya-ss/sd-webui-additional-networks)。 + +在此存储库的脚本中,您还可以预先将经过训练的LoRA模型合并到Stable Diffusion模型中。 + +请注意,与cloneofsimo先生的存储库以及d8ahazard先生的[Stable-Diffusion-WebUI的Dreambooth扩展](https://github.com/d8ahazard/sd_dreambooth_extension)不兼容,因为它们进行了一些功能扩展(如下文所述)。 + +# 学习步骤 + +请先参考此存储库的README文件并进行环境设置。 + +## 准备数据 + +请参考 [关于准备学习数据](./train_README-zh.md)。 + +## 网络训练 + +使用`train_network.py`。 + +在`train_network.py`中,使用`--network_module`选项指定要训练的模块名称。对于LoRA模块,它应该是`network.lora`,请指定它。 + +请注意,学习率应该比通常的DreamBooth或fine tuning要高,建议指定为`1e-4`至`1e-3`左右。 + +以下是命令行示例。 + +``` +accelerate launch --num_cpu_threads_per_process 1 train_network.py + --pretrained_model_name_or_path=<.ckpt或.safetensord或Diffusers版模型目录> + --dataset_config=<数据集配置的.toml文件> + --output_dir=<训练过程中的模型输出文件夹> + --output_name=<训练模型输出时的文件名> + --save_model_as=safetensors + --prior_loss_weight=1.0 + --max_train_steps=400 + --learning_rate=1e-4 + --optimizer_type="AdamW8bit" + --xformers + --mixed_precision="fp16" + --cache_latents + --gradient_checkpointing + --save_every_n_epochs=1 + --network_module=networks.lora +``` + +在这个命令行中,LoRA-LierLa将会被训练。 + +LoRA的模型将会被保存在通过`--output_dir`选项指定的文件夹中。关于其他选项和优化器等,请参阅[学习的通用文档](./train_README-zh.md)中的“常用选项”。 + +此外,还可以指定以下选项: + +* `--network_dim` + * 指定LoRA的RANK(例如:`--network_dim=4`)。默认值为4。数值越大表示表现力越强,但需要更多的内存和时间来训练。而且不要盲目增加此数值。 +* `--network_alpha` + * 指定用于防止下溢并稳定训练的alpha值。默认值为1。如果与`network_dim`指定相同的值,则将获得与以前版本相同的行为。 +* `--persistent_data_loader_workers` + * 在Windows环境中指定可大幅缩短epoch之间的等待时间。 +* `--max_data_loader_n_workers` + * 指定数据读取进程的数量。进程数越多,数据读取速度越快,可以更有效地利用GPU,但会占用主存。默认值为“`8`或`CPU同步执行线程数-1`的最小值”,因此如果主存不足或GPU使用率超过90%,则应将这些数字降低到约`2`或`1`。 +* `--network_weights` + * 在训练之前读取预训练的LoRA权重,并在此基础上进行进一步的训练。 +* `--network_train_unet_only` + * 仅启用与U-Net相关的LoRA模块。在类似fine tuning的学习中指定此选项可能会很有用。 +* `--network_train_text_encoder_only` + * 仅启用与Text Encoder相关的LoRA模块。可能会期望Textual Inversion效果。 +* `--unet_lr` + * 当在U-Net相关的LoRA模块中使用与常规学习率(由`--learning_rate`选项指定)不同的学习率时,应指定此选项。 +* `--text_encoder_lr` + * 当在Text Encoder相关的LoRA模块中使用与常规学习率(由`--learning_rate`选项指定)不同的学习率时,应指定此选项。可能最好将Text Encoder的学习率稍微降低(例如5e-5)。 +* `--network_args` + * 可以指定多个参数。将在下面详细说明。 + +当未指定`--network_train_unet_only`和`--network_train_text_encoder_only`时(默认情况),将启用Text Encoder和U-Net的两个LoRA模块。 + +# 其他的学习方法 + +## 学习 LoRA-C3Lier + +请使用以下方式 + +``` +--network_args "conv_dim=4" +``` + +DyLoRA是在这篇论文中提出的[DyLoRA: Parameter Efficient Tuning of Pre-trained Models using Dynamic Search-Free Low-Rank Adaptation](​https://arxiv.org/abs/2210.07558), +[其官方实现可在这里找到](​https://github.com/huawei-noah/KD-NLP/tree/main/DyLoRA)。 + +根据论文,LoRA的rank并不是越高越好,而是需要根据模型、数据集、任务等因素来寻找合适的rank。使用DyLoRA,可以同时在指定的维度(rank)下学习多种rank的LoRA,从而省去了寻找最佳rank的麻烦。 + 
+本存储库的实现基于官方实现进行了自定义扩展(因此可能存在缺陷)。 + +### 本存储库DyLoRA的特点 + +DyLoRA训练后的模型文件与LoRA兼容。此外,可以从模型文件中提取多个低于指定维度(rank)的LoRA。 + +DyLoRA-LierLa和DyLoRA-C3Lier均可训练。 + +### 使用DyLoRA进行训练 + +请指定与DyLoRA相对应的`network.dylora`,例如 `--network_module=networks.dylora`。 + +此外,通过 `--network_args` 指定例如`--network_args "unit=4"`的参数。`unit`是划分rank的单位。例如,可以指定为`--network_dim=16 --network_args "unit=4"`。请将`unit`视为可以被`network_dim`整除的值(`network_dim`是`unit`的倍数)。 + +如果未指定`unit`,则默认为`unit=1`。 + +以下是示例说明。 + +``` +--network_module=networks.dylora --network_dim=16 --network_args "unit=4" + +--network_module=networks.dylora --network_dim=32 --network_alpha=16 --network_args "unit=4" +``` + +对于DyLoRA-C3Lier,需要在 `--network_args` 中指定 `conv_dim`,例如 `conv_dim=4`。与普通的LoRA不同,`conv_dim`必须与`network_dim`具有相同的值。以下是一个示例描述: + +``` +--network_module=networks.dylora --network_dim=16 --network_args "conv_dim=16" "unit=4" + +--network_module=networks.dylora --network_dim=32 --network_alpha=16 --network_args "conv_dim=32" "conv_alpha=16" "unit=8" +``` + +例如,当使用dim=16、unit=4(如下所述)进行学习时,可以学习和提取4个rank的LoRA,即4、8、12和16。通过在每个提取的模型中生成图像并进行比较,可以选择最佳rank的LoRA。 + +其他选项与普通的LoRA相同。 + +*`unit`是本存储库的独有扩展,在DyLoRA中,由于预计相比同维度(rank)的普通LoRA,学习时间更长,因此将分割单位增加。 + +### 从DyLoRA模型中提取LoRA模型 + +请使用`networks`文件夹中的`extract_lora_from_dylora.py`。指定`unit`单位后,从DyLoRA模型中提取LoRA模型。 + +例如,命令行如下: + +```powershell +python networks\extract_lora_from_dylora.py --model "foldername/dylora-model.safetensors" --save_to "foldername/dylora-model-split.safetensors" --unit 4 +``` + +`--model` 参数用于指定DyLoRA模型文件。`--save_to` 参数用于指定要保存提取的模型的文件名(rank值将附加到文件名中)。`--unit` 参数用于指定DyLoRA训练时的`unit`。 + +## 分层学习率 + +请参阅PR#355了解详细信息。 + +您可以指定完整模型的25个块的权重。虽然第一个块没有对应的LoRA,但为了与分层LoRA应用等的兼容性,将其设为25个。此外,如果不扩展到conv2d3x3,则某些块中可能不存在LoRA,但为了统一描述,请始终指定25个值。 + +请在 `--network_args` 中指定以下参数。 + +- `down_lr_weight`:指定U-Net down blocks的学习率权重。可以指定以下内容: + - 每个块的权重:指定12个数字,例如`"down_lr_weight=0,0,0,0,0,0,1,1,1,1,1,1"` + - 从预设中指定:例如`"down_lr_weight=sine"`(使用正弦曲线指定权重)。可以指定sine、cosine、linear、reverse_linear、zeros。另外,添加 `+数字` 时,可以将指定的数字加上(变为0.25〜1.25)。 +- `mid_lr_weight`:指定U-Net mid block的学习率权重。只需指定一个数字,例如 `"mid_lr_weight=0.5"`。 +- `up_lr_weight`:指定U-Net up blocks的学习率权重。与down_lr_weight相同。 +- 省略指定的部分将被视为1.0。另外,如果将权重设为0,则不会创建该块的LoRA模块。 +- `block_lr_zero_threshold`:如果权重小于此值,则不会创建LoRA模块。默认值为0。 + +### 分层学习率命令行指定示例: + + +```powershell +--network_args "down_lr_weight=0.5,0.5,0.5,0.5,1.0,1.0,1.0,1.0,1.5,1.5,1.5,1.5" "mid_lr_weight=2.0" "up_lr_weight=1.5,1.5,1.5,1.5,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.5" + +--network_args "block_lr_zero_threshold=0.1" "down_lr_weight=sine+.5" "mid_lr_weight=1.5" "up_lr_weight=cosine+.5" +``` + +### Hierarchical Learning Rate指定的toml文件示例: + +```toml +network_args = [ "down_lr_weight=0.5,0.5,0.5,0.5,1.0,1.0,1.0,1.0,1.5,1.5,1.5,1.5", "mid_lr_weight=2.0", "up_lr_weight=1.5,1.5,1.5,1.5,1.0,1.0,1.0,1.0,0.5,0.5,0.5,0.5",] + +network_args = [ "block_lr_zero_threshold=0.1", "down_lr_weight=sine+.5", "mid_lr_weight=1.5", "up_lr_weight=cosine+.5", ] +``` + +## 层次结构维度(rank) + +您可以指定完整模型的25个块的维度(rank)。与分层学习率一样,某些块可能不存在LoRA,但请始终指定25个值。 + +请在 `--network_args` 中指定以下参数: + +- `block_dims`:指定每个块的维度(rank)。指定25个数字,例如 `"block_dims=2,2,2,2,4,4,4,4,6,6,6,6,8,6,6,6,6,4,4,4,4,2,2,2,2"`。 +- `block_alphas`:指定每个块的alpha。与block_dims一样,指定25个数字。如果省略,将使用network_alpha的值。 +- `conv_block_dims`:将LoRA扩展到Conv2d 3x3,并指定每个块的维度(rank)。 +- `conv_block_alphas`:在将LoRA扩展到Conv2d 3x3时指定每个块的alpha。如果省略,将使用conv_alpha的值。 + +### 层次结构维度(rank)命令行指定示例: + + +```powershell +--network_args "block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2" + +--network_args 
"block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2" "conv_block_dims=2,2,2,2,4,4,4,4,6,6,6,6,8,6,6,6,6,4,4,4,4,2,2,2,2" + +--network_args "block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2" "block_alphas=2,2,2,2,4,4,4,4,6,6,6,6,8,6,6,6,6,4,4,4,4,2,2,2,2" +``` + +### 层级别dim(rank) toml文件指定示例: + +```toml +network_args = [ "block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2",] + +network_args = [ "block_dims=2,4,4,4,8,8,8,8,12,12,12,12,16,12,12,12,12,8,8,8,8,4,4,4,2", "block_alphas=2,2,2,2,4,4,4,4,6,6,6,6,8,6,6,6,6,4,4,4,4,2,2,2,2",] +``` + +# Other scripts +这些是与LoRA相关的脚本,如合并脚本等。 + +关于合并脚本 +您可以使用merge_lora.py脚本将LoRA的训练结果合并到稳定扩散模型中,也可以将多个LoRA模型合并。 + +合并到稳定扩散模型中的LoRA模型 +合并后的模型可以像常规的稳定扩散ckpt一样使用。例如,以下是一个命令行示例: + +``` +python networks\merge_lora.py --sd_model ..\model\model.ckpt + --save_to ..\lora_train1\model-char1-merged.safetensors + --models ..\lora_train1\last.safetensors --ratios 0.8 +``` + +请使用 Stable Diffusion v2.x 模型进行训练并进行合并时,需要指定--v2选项。 + +使用--sd_model选项指定要合并的 Stable Diffusion 模型文件(仅支持 .ckpt 或 .safetensors 格式,目前不支持 Diffusers)。 + +使用--save_to选项指定合并后模型的保存路径(根据扩展名自动判断为 .ckpt 或 .safetensors)。 + +使用--models选项指定已训练的 LoRA 模型文件,也可以指定多个,然后按顺序进行合并。 + +使用--ratios选项以0~1.0的数字指定每个模型的应用率(将多大比例的权重反映到原始模型中)。例如,在接近过度拟合的情况下,降低应用率可能会使结果更好。请指定与模型数量相同的比率。 + +当指定多个模型时,格式如下: + + +``` +python networks\merge_lora.py --sd_model ..\model\model.ckpt + --save_to ..\lora_train1\model-char1-merged.safetensors + --models ..\lora_train1\last.safetensors ..\lora_train2\last.safetensors --ratios 0.8 0.5 +``` + +### 将多个LoRA模型合并 + +将多个LoRA模型逐个应用于SD模型与将多个LoRA模型合并后再应用于SD模型之间,由于计算顺序的不同,会得到微妙不同的结果。 + +例如,下面是一个命令行示例: + +``` +python networks\merge_lora.py + --save_to ..\lora_train1\model-char1-style1-merged.safetensors + --models ..\lora_train1\last.safetensors ..\lora_train2\last.safetensors --ratios 0.6 0.4 +``` + +--sd_model选项不需要指定。 + +通过--save_to选项指定合并后的LoRA模型的保存位置(.ckpt或.safetensors,根据扩展名自动识别)。 + +通过--models选项指定学习的LoRA模型文件。可以指定三个或更多。 + +通过--ratios选项以0~1.0的数字指定每个模型的比率(反映多少权重来自原始模型)。如果将两个模型一对一合并,则比率将是“0.5 0.5”。如果比率为“1.0 1.0”,则总重量将过大,可能会产生不理想的结果。 + +在v1和v2中学习的LoRA,以及rank(维数)或“alpha”不同的LoRA不能合并。仅包含U-Net的LoRA和包含U-Net+文本编码器的LoRA可以合并,但结果未知。 + +### 其他选项 + +* 精度 + * 可以从float、fp16或bf16中选择合并计算时的精度。默认为float以保证精度。如果想减少内存使用量,请指定fp16/bf16。 +* save_precision + * 可以从float、fp16或bf16中选择在保存模型时的精度。默认与精度相同。 + +## 合并多个维度不同的LoRA模型 + +将多个LoRA近似为一个LoRA(无法完全复制)。使用'svd_merge_lora.py'。例如,以下是命令行的示例。 +``` +python networks\svd_merge_lora.py + --save_to ..\lora_train1\model-char1-style1-merged.safetensors + --models ..\lora_train1\last.safetensors ..\lora_train2\last.safetensors + --ratios 0.6 0.4 --new_rank 32 --device cuda +``` +`merge_lora.py`和主要选项相同。以下选项已添加: + +- `--new_rank` + - 指定要创建的LoRA rank。 +- `--new_conv_rank` + - 指定要创建的Conv2d 3x3 LoRA的rank。如果省略,则与`new_rank`相同。 +- `--device` + - 如果指定为`--device cuda`,则在GPU上执行计算。处理速度将更快。 + +## 在此存储库中生成图像的脚本中 + +请在`gen_img_diffusers.py`中添加`--network_module`和`--network_weights`选项。其含义与训练时相同。 + +通过`--network_mul`选项,可以指定0~1.0的数字来改变LoRA的应用率。 + +## 请参考以下示例,在Diffusers的pipeline中生成。 + +所需文件仅为networks/lora.py。请注意,该示例只能在Diffusers版本0.10.2中正常运行。 + +```python +import torch +from diffusers import StableDiffusionPipeline +from networks.lora import LoRAModule, create_network_from_weights +from safetensors.torch import load_file + +# if the ckpt is CompVis based, convert it to Diffusers beforehand with tools/convert_diffusers20_original_sd.py. See --help for more details. 
+ +model_id_or_dir = r"model_id_on_hugging_face_or_dir" +device = "cuda" + +# create pipe +print(f"creating pipe from {model_id_or_dir}...") +pipe = StableDiffusionPipeline.from_pretrained(model_id_or_dir, revision="fp16", torch_dtype=torch.float16) +pipe = pipe.to(device) +vae = pipe.vae +text_encoder = pipe.text_encoder +unet = pipe.unet + +# load lora networks +print(f"loading lora networks...") + +lora_path1 = r"lora1.safetensors" +sd = load_file(lora_path1) # If the file is .ckpt, use torch.load instead. +network1, sd = create_network_from_weights(0.5, None, vae, text_encoder,unet, sd) +network1.apply_to(text_encoder, unet) +network1.load_state_dict(sd) +network1.to(device, dtype=torch.float16) + +# # You can merge weights instead of apply_to+load_state_dict. network.set_multiplier does not work +# network.merge_to(text_encoder, unet, sd) + +lora_path2 = r"lora2.safetensors" +sd = load_file(lora_path2) +network2, sd = create_network_from_weights(0.7, None, vae, text_encoder,unet, sd) +network2.apply_to(text_encoder, unet) +network2.load_state_dict(sd) +network2.to(device, dtype=torch.float16) + +lora_path3 = r"lora3.safetensors" +sd = load_file(lora_path3) +network3, sd = create_network_from_weights(0.5, None, vae, text_encoder,unet, sd) +network3.apply_to(text_encoder, unet) +network3.load_state_dict(sd) +network3.to(device, dtype=torch.float16) + +# prompts +prompt = "masterpiece, best quality, 1girl, in white shirt, looking at viewer" +negative_prompt = "bad quality, worst quality, bad anatomy, bad hands" + +# exec pipe +print("generating image...") +with torch.autocast("cuda"): + image = pipe(prompt, guidance_scale=7.5, negative_prompt=negative_prompt).images[0] + +# if not merged, you can use set_multiplier +# network1.set_multiplier(0.8) +# and generate image again... 
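+# for example (illustrative multiplier values, not from this document):
+# network2.set_multiplier(0.3)
+# network3.set_multiplier(1.0)
+# with torch.autocast("cuda"):
+#     image = pipe(prompt, guidance_scale=7.5, negative_prompt=negative_prompt).images[0]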
+ +# save image +image.save(r"by_diffusers.png") +``` + +## 从两个模型的差异中创建LoRA模型 + +该脚本参考了[此处的讨论](https://github.com/cloneofsimo/lora/discussions/56)实现。数学原理没有改变(我并不完全理解,但似乎使用了奇异值分解进行近似)。 + +将两个模型(例如微调前的原始模型和微调后的模型)的差异近似为LoRA。 + +### 脚本执行方法 + +请按以下方式指定。 + +``` +python networks\extract_lora_from_models.py --model_org base-model.ckpt + --model_tuned fine-tuned-model.ckpt + --save_to lora-weights.safetensors --dim 4 +``` + +--model_org 选项指定原始的Stable Diffusion模型。创建出的LoRA模型需要应用在该模型上使用。可以指定.ckpt或.safetensors文件。 + +--model_tuned 选项指定要提取差分的目标Stable Diffusion模型。例如,可以指定经过Fine Tuning或DreamBooth后的模型。可以指定.ckpt或.safetensors文件。 + +--save_to 指定LoRA模型的保存路径。--dim指定LoRA的维数。 + +生成的LoRA模型可以像已训练的LoRA模型一样使用。 + +当两个模型的文本编码器相同时,得到的LoRA将成为仅包含U-Net的LoRA。 + +### 其他选项 + +- `--v2` + - 如果使用v2.x的Stable Diffusion模型,请指定此选项。 +- `--device` + - 指定为 `--device cuda` 可在GPU上执行计算。这会使处理速度更快(在CPU上也不会太慢,GPU大约快几倍)。 +- `--save_precision` + - 指定LoRA的保存格式为“float”、“fp16”、“bf16”。如果省略,将使用float。 +- `--conv_dim` + - 指定后,会将LoRA的应用范围扩展到Conv2d 3x3,并以该值指定Conv2d 3x3的rank。 + +## 图像大小调整脚本 + +(稍后将整理文件,但现在先在这里写下说明。) + +在 Aspect Ratio Bucketing 的功能扩展中,现在可以将小图像直接用作教师数据,而无需进行放大。有报告指出,将原始教师图像缩小后的图像添加到教师数据中可以提高准确性,我们也因此收到了用于预处理的脚本。我们整理并添加了该脚本,在此感谢 bmaltais 先生。 + +### 脚本的执行方法 + +原始图像以及调整大小后的图像将保存到目标文件夹中。调整大小后的图像会在文件名中添加“+512x512”之类的目标分辨率(与调整大小后的实际图像尺寸不同)。小于目标分辨率的图像不会被放大。 + +``` +python tools\resize_images_to_resolution.py --max_resolution 512x512,384x384,256x256 --save_as_png + --copy_associated_files 源图像文件夹 目标文件夹 +``` + +源图像文件夹中的图像文件将被调整大小以达到指定的分辨率(可以指定多个),并保存到目标文件夹中。除图像外的文件将保持原样。 + +请使用`--max_resolution`选项指定调整后的目标大小,图像会被缩小到该面积大小。如果指定多个,则会分别按每个分辨率进行调整。例如,指定“512x512,384x384,256x256”时,目标文件夹中将包含1张原始大小的图像和3张调整大小后的图像,共计4张图像。 + +如果使用`--save_as_png`选项,则会以PNG格式保存。如果省略,则默认以JPEG格式(quality=100)保存。 + +如果使用`--copy_associated_files`选项,则会将与图像同名(不含扩展名)的文件(例如caption文件等)复制为与调整大小后的图像文件相同的文件名。 + +### 其他选项 + +- `divisible_by` + - 为使调整大小后的图像尺寸能够被该值整除,会对图像进行中心裁剪(垂直和水平尺寸分别处理)。 +- `interpolation` + - 指定缩小时的插值方法。可从`area`、`cubic`、`lanczos4`中选择,默认为`area`。 + + +# 追加信息 + +## 与cloneofsimo的代码库的区别 + +截至2022年12月25日,本代码库将LoRA的应用范围扩展到了Text Encoder的MLP、U-Net的FFN以及Transformer的输入/输出投影中,从而增强了表现力。但相应地,内存使用量也有所增加,接近了8GB的限制。 + +此外,模块交换机制也完全不同。 + +## 关于未来的扩展 + +除了LoRA之外,我们还计划添加其他扩展,以支持更多的功能。 diff --git a/train_ti_README-ja.md b/docs/train_ti_README-ja.md similarity index 100% rename from train_ti_README-ja.md rename to docs/train_ti_README-ja.md diff --git a/dreambooth_gui.py b/dreambooth_gui.py index 4c1148b8f..2acc9157a 100644 --- a/dreambooth_gui.py +++ b/dreambooth_gui.py @@ -27,6 +27,7 @@ # set_legacy_8bitadam, update_my_data, check_if_model_exist, + output_message, ) from library.tensorboard_gui import ( gradio_tensorboard, @@ -38,7 +39,8 @@ ) from library.utilities import utilities_tab from library.sampler_gui import sample_gradio_config, run_cmd_sample -from easygui import msgbox + +# from easygui import msgbox folder_symbol = '\U0001f4c2' # 📂 refresh_symbol = '\U0001f504' # 🔄 @@ -68,6 +70,7 @@ def save_configuration( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -101,7 +104,9 @@ def save_configuration( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -113,6 +118,8 @@ save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): # Get list of function parameters and values 
parameters = list(locals().items()) @@ -181,6 +188,7 @@ def open_configuration( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -214,7 +222,9 @@ def open_configuration( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -226,6 +236,8 @@ def open_configuration( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -257,6 +269,7 @@ def open_configuration( def train_model( + headless, pretrained_model_name_or_path, v2, v_parameterization, @@ -276,6 +289,7 @@ def train_model( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -309,7 +323,9 @@ def train_model( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -321,38 +337,66 @@ def train_model( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): + headless_bool = True if headless.get('label') == 'True' else False + if pretrained_model_name_or_path == '': - msgbox('Source model information is missing') + output_message( + msg='Source model information is missing', headless=headless_bool + ) return if train_data_dir == '': - msgbox('Image folder path is missing') + output_message( + msg='Image folder path is missing', headless=headless_bool + ) return if not os.path.exists(train_data_dir): - msgbox('Image folder does not exist') + output_message( + msg='Image folder does not exist', headless=headless_bool + ) return if reg_data_dir != '': if not os.path.exists(reg_data_dir): - msgbox('Regularisation folder does not exist') + output_message( + msg='Regularisation folder does not exist', + headless=headless_bool, + ) return if output_dir == '': - msgbox('Output folder path is missing') + output_message( + msg='Output folder path is missing', headless=headless_bool + ) return - if check_if_model_exist(output_name, output_dir, save_model_as): + if check_if_model_exist( + output_name, output_dir, save_model_as, headless=headless_bool + ): return if optimizer == 'Adafactor' and lr_warmup != '0': - msgbox( - "Warning: lr_scheduler is set to 'Adafactor', so 'LR warmup (% of steps)' will be considered 0.", + output_message( + msg="Warning: lr_scheduler is set to 'Adafactor', so 'LR warmup (% of steps)' will be considered 0.", title='Warning', + headless=headless_bool, ) lr_warmup = '0' + # if float(noise_offset) > 0 and ( + # multires_noise_iterations > 0 or multires_noise_discount > 0 + # ): + # output_message( + # msg="noise offset and multires_noise can't be set at the same time. 
Only use one or the other.", + # title='Error', + # headless=headless_bool, + # ) + # return + # Get a list of all subfolders in train_data_dir, excluding hidden folders subfolders = [ f @@ -433,6 +477,7 @@ def train_model( math.ceil( float(total_steps) / int(train_batch_size) + / int(gradient_accumulation_steps) * int(epoch) * int(reg_factor) ) @@ -472,7 +517,8 @@ def train_model( run_cmd += f' --reg_data_dir="{reg_data_dir}"' run_cmd += f' --resolution={max_resolution}' run_cmd += f' --output_dir="{output_dir}"' - run_cmd += f' --logging_dir="{logging_dir}"' + if not logging_dir == '': + run_cmd += f' --logging_dir="{logging_dir}"' if not stop_text_encoder_training == 0: run_cmd += ( f' --stop_text_encoder_training={stop_text_encoder_training}' @@ -510,6 +556,7 @@ def train_model( seed=seed, caption_extension=caption_extension, cache_latents=cache_latents, + cache_latents_to_disk=cache_latents_to_disk, optimizer=optimizer, optimizer_args=optimizer_args, ) @@ -536,13 +583,19 @@ def train_model( bucket_reso_steps=bucket_reso_steps, caption_dropout_every_n_epochs=caption_dropout_every_n_epochs, caption_dropout_rate=caption_dropout_rate, + noise_offset_type=noise_offset_type, noise_offset=noise_offset, + adaptive_noise_scale=adaptive_noise_scale, + multires_noise_iterations=multires_noise_iterations, + multires_noise_discount=multires_noise_discount, additional_parameters=additional_parameters, vae_batch_size=vae_batch_size, min_snr_gamma=min_snr_gamma, save_every_n_steps=save_every_n_steps, save_last_n_steps=save_last_n_steps, save_last_n_steps_state=save_last_n_steps_state, + use_wandb=use_wandb, + wandb_api_key=wandb_api_key, ) run_cmd += run_cmd_sample( @@ -574,9 +627,11 @@ def dreambooth_tab( reg_data_dir=gr.Textbox(), output_dir=gr.Textbox(), logging_dir=gr.Textbox(), + headless=False, ): dummy_db_true = gr.Label(value=True, visible=False) dummy_db_false = gr.Label(value=False, visible=False) + dummy_headless = gr.Label(value=headless, visible=False) gr.Markdown('Train a custom model using kohya dreambooth python code...') ( button_open_config, @@ -584,7 +639,7 @@ def dreambooth_tab( button_save_as_config, config_file_name, button_load_config, - ) = gradio_config() + ) = gradio_config(headless=headless) ( pretrained_model_name_or_path, @@ -592,7 +647,7 @@ def dreambooth_tab( v_parameterization, save_model_as, model_list, - ) = gradio_source_model() + ) = gradio_source_model(headless=headless) with gr.Tab('Folders'): with gr.Row(): @@ -601,7 +656,7 @@ def dreambooth_tab( placeholder='Folder where the training folders containing the images are located', ) train_data_dir_input_folder = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) train_data_dir_input_folder.click( get_folder_path, @@ -613,7 +668,7 @@ def dreambooth_tab( placeholder='(Optional) Folder where where the regularization folders containing the images are located', ) reg_data_dir_input_folder = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) reg_data_dir_input_folder.click( get_folder_path, @@ -626,7 +681,7 @@ def dreambooth_tab( placeholder='Folder to output trained model', ) output_dir_input_folder = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) output_dir_input_folder.click(get_folder_path, outputs=output_dir) logging_dir = gr.Textbox( @@ -634,7 +689,7 @@ def dreambooth_tab( placeholder='Optional: enable logging and output TensorBoard log to this 
folder', ) logging_dir_input_folder = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) logging_dir_input_folder.click( get_folder_path, @@ -682,6 +737,7 @@ def dreambooth_tab( seed, caption_extension, cache_latents, + cache_latents_to_disk, optimizer, optimizer_args, ) = gradio_training( @@ -722,7 +778,9 @@ def dreambooth_tab( label='VAE', placeholder='(Optiona) path to checkpoint of vae to replace for training', ) - vae_button = gr.Button('📂', elem_id='open_folder_small') + vae_button = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) vae_button.click( get_any_file_path, outputs=vae, @@ -750,14 +808,18 @@ def dreambooth_tab( bucket_reso_steps, caption_dropout_every_n_epochs, caption_dropout_rate, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, additional_parameters, vae_batch_size, min_snr_gamma, save_every_n_steps, save_last_n_steps, save_last_n_steps_state, - ) = gradio_advanced_training() + use_wandb, + wandb_api_key, + ) = gradio_advanced_training(headless=headless) color_aug.change( color_aug_changed, inputs=[color_aug], @@ -780,6 +842,7 @@ def dreambooth_tab( reg_data_dir_input=reg_data_dir, output_dir_input=output_dir, logging_dir_input=logging_dir, + headless=headless, ) button_run = gr.Button('Train model', variant='primary') @@ -818,6 +881,7 @@ def dreambooth_tab( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -851,7 +915,9 @@ def dreambooth_tab( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -863,6 +929,8 @@ def dreambooth_tab( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ] button_open_config.click( @@ -895,7 +963,7 @@ def dreambooth_tab( button_run.click( train_model, - inputs=settings_list, + inputs=[dummy_headless] + settings_list, show_progress=False, ) @@ -910,12 +978,17 @@ def dreambooth_tab( def UI(**kwargs): css = '' + headless = kwargs.get('headless', False) + print(f'headless: {headless}') + if os.path.exists('./style.css'): with open(os.path.join('./style.css'), 'r', encoding='utf8') as file: print('Load CSS...') css += file.read() + '\n' - interface = gr.Blocks(css=css) + interface = gr.Blocks( + css=css, title='Kohya_ss GUI', theme=gr.themes.Default() + ) with interface: with gr.Tab('Dreambooth'): @@ -924,7 +997,7 @@ def UI(**kwargs): reg_data_dir_input, output_dir_input, logging_dir_input, - ) = dreambooth_tab() + ) = dreambooth_tab(headless=headless) with gr.Tab('Utilities'): utilities_tab( train_data_dir_input=train_data_dir_input, @@ -932,26 +1005,39 @@ def UI(**kwargs): output_dir_input=output_dir_input, logging_dir_input=logging_dir_input, enable_copy_info_button=True, + headless=headless, ) # Show the interface launch_kwargs = {} - if not kwargs.get('username', None) == '': - launch_kwargs['auth'] = ( - kwargs.get('username', None), - kwargs.get('password', None), - ) - if kwargs.get('server_port', 0) > 0: - launch_kwargs['server_port'] = kwargs.get('server_port', 0) - if kwargs.get('inbrowser', False): - launch_kwargs['inbrowser'] = kwargs.get('inbrowser', False) - print(launch_kwargs) + username = kwargs.get('username') + password = kwargs.get('password') + server_port = 
kwargs.get('server_port', 0) + inbrowser = kwargs.get('inbrowser', False) + share = kwargs.get('share', False) + server_name = kwargs.get('listen') + + launch_kwargs['server_name'] = server_name + if username and password: + launch_kwargs['auth'] = (username, password) + if server_port > 0: + launch_kwargs['server_port'] = server_port + if inbrowser: + launch_kwargs['inbrowser'] = inbrowser + if share: + launch_kwargs['share'] = share interface.launch(**launch_kwargs) if __name__ == '__main__': # torch.cuda.set_per_process_memory_fraction(0.48) parser = argparse.ArgumentParser() + parser.add_argument( + '--listen', + type=str, + default='127.0.0.1', + help='IP to listen on for connections to Gradio', + ) parser.add_argument( '--username', type=str, default='', help='Username for authentication' ) @@ -967,6 +1053,12 @@ def UI(**kwargs): parser.add_argument( '--inbrowser', action='store_true', help='Open in browser' ) + parser.add_argument( + '--share', action='store_true', help='Share the gradio UI' + ) + parser.add_argument( + '--headless', action='store_true', help='Is the server headless' + ) args = parser.parse_args() @@ -975,4 +1067,7 @@ def UI(**kwargs): password=args.password, inbrowser=args.inbrowser, server_port=args.server_port, + share=args.share, + listen=args.listen, + headless=args.headless, ) diff --git a/fine_tune.py b/fine_tune.py index b6a8d1d7c..154d3be72 100644 --- a/fine_tune.py +++ b/fine_tune.py @@ -21,7 +21,7 @@ BlueprintGenerator, ) import library.custom_train_functions as custom_train_functions -from library.custom_train_functions import apply_snr_weight, get_weighted_text_embeddings +from library.custom_train_functions import apply_snr_weight, get_weighted_text_embeddings, pyramid_noise_like, apply_noise_offset def train(args): @@ -90,7 +90,7 @@ def train(args): weight_dtype, save_dtype = train_util.prepare_dtype(args) # モデルを読み込む - text_encoder, vae, unet, load_stable_diffusion_format = train_util.load_target_model(args, weight_dtype) + text_encoder, vae, unet, load_stable_diffusion_format = train_util.load_target_model(args, weight_dtype, accelerator) # verify load/save model formats if load_stable_diffusion_format: @@ -228,6 +228,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): else: unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + # transform DDP after prepare + text_encoder, unet = train_util.transform_if_model_is_DDP(text_encoder, unet) + # 実験的機能:勾配も含めたfp16学習を行う PyTorchにパッチを当ててfp16でのgrad scaleを有効にする if args.full_fp16: train_util.patch_accelerator_for_fp16_training(accelerator) @@ -263,7 +266,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name) for epoch in range(num_train_epochs): - print(f"epoch {epoch+1}/{num_train_epochs}") + print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 for m in training_models: @@ -302,8 +305,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): # Sample noise that we'll add to the latents noise = torch.randn_like(latents, device=latents.device) if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device) + noise = apply_noise_offset(latents, noise, args.noise_offset, args.adaptive_noise_scale) + elif args.multires_noise_iterations: + noise = pyramid_noise_like(noise, 
latents.device, args.multires_noise_iterations, args.multires_noise_discount) # Sample a random timestep for each image timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (b_size,), device=latents.device) @@ -376,7 +380,7 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module): current_loss = loss.detach().item() # 平均なのでbatch sizeは関係ないはず if args.logging_dir is not None: logs = {"loss": current_loss, "lr": float(lr_scheduler.get_last_lr()[0])} - if args.optimizer_type.lower() == "DAdaptation".lower(): # tracking d*lr value + if args.optimizer_type.lower().startswith("DAdapt".lower()): # tracking d*lr value logs["lr/d*lr"] = ( lr_scheduler.optimizers[0].param_groups[0]["d"] * lr_scheduler.optimizers[0].param_groups[0]["lr"] ) diff --git a/finetune_gui.py b/finetune_gui.py index e94157857..525d62a3b 100644 --- a/finetune_gui.py +++ b/finetune_gui.py @@ -21,6 +21,7 @@ # set_legacy_8bitadam, update_my_data, check_if_model_exist, + output_message, ) from library.tensorboard_gui import ( gradio_tensorboard, @@ -29,7 +30,8 @@ ) from library.utilities import utilities_tab from library.sampler_gui import sample_gradio_config, run_cmd_sample -from easygui import msgbox + +# from easygui import msgbox folder_symbol = '\U0001f4c2' # 📂 refresh_symbol = '\U0001f504' # 🔄 @@ -90,6 +92,7 @@ def save_configuration( color_aug, model_list, cache_latents, + cache_latents_to_disk, use_latent_files, keep_tokens, persistent_data_loader_workers, @@ -100,7 +103,9 @@ def save_configuration( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -112,6 +117,8 @@ def save_configuration( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -209,6 +216,7 @@ def open_configuration( color_aug, model_list, cache_latents, + cache_latents_to_disk, use_latent_files, keep_tokens, persistent_data_loader_workers, @@ -219,7 +227,9 @@ def open_configuration( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -231,6 +241,8 @@ def open_configuration( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -262,6 +274,7 @@ def open_configuration( def train_model( + headless, pretrained_model_name_or_path, v2, v_parameterization, @@ -310,6 +323,7 @@ def train_model( color_aug, model_list, # Keep this. 
Yes, it is unused here but required given the common list used cache_latents, + cache_latents_to_disk, use_latent_files, keep_tokens, persistent_data_loader_workers, @@ -320,7 +334,9 @@ def train_model( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -332,14 +348,31 @@ def train_model( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): - if check_if_model_exist(output_name, output_dir, save_model_as): + headless_bool = True if headless.get('label') == 'True' else False + + if check_if_model_exist( + output_name, output_dir, save_model_as, headless_bool + ): return + # if float(noise_offset) > 0 and ( + # multires_noise_iterations > 0 or multires_noise_discount > 0 + # ): + # output_message( + # msg="noise offset and multires_noise can't be set at the same time. Only use one or the other.", + # title='Error', + # headless=headless_bool, + # ) + # return + if optimizer == 'Adafactor' and lr_warmup != '0': - msgbox( - "Warning: lr_scheduler is set to 'Adafactor', so 'LR warmup (% of steps)' will be considered 0.", + output_message( + msg="Warning: lr_scheduler is set to 'Adafactor', so 'LR warmup (% of steps)' will be considered 0.", title='Warning', + headless=headless_bool, ) lr_warmup = '0' @@ -407,7 +440,12 @@ def train_model( # calculate max_train_steps max_train_steps = int( - math.ceil(float(repeats) / int(train_batch_size) * int(epoch)) + math.ceil( + float(repeats) + / int(train_batch_size) + / int(gradient_accumulation_steps) + * int(epoch) + ) ) # Divide by two because flip augmentation create two copied of the source images @@ -472,6 +510,7 @@ def train_model( seed=seed, caption_extension=caption_extension, cache_latents=cache_latents, + cache_latents_to_disk=cache_latents_to_disk, optimizer=optimizer, optimizer_args=optimizer_args, ) @@ -498,13 +537,19 @@ def train_model( bucket_reso_steps=bucket_reso_steps, caption_dropout_every_n_epochs=caption_dropout_every_n_epochs, caption_dropout_rate=caption_dropout_rate, + noise_offset_type=noise_offset_type, noise_offset=noise_offset, + adaptive_noise_scale=adaptive_noise_scale, + multires_noise_iterations=multires_noise_iterations, + multires_noise_discount=multires_noise_discount, additional_parameters=additional_parameters, vae_batch_size=vae_batch_size, min_snr_gamma=min_snr_gamma, save_every_n_steps=save_every_n_steps, save_last_n_steps=save_last_n_steps, save_last_n_steps_state=save_last_n_steps_state, + use_wandb=use_wandb, + wandb_api_key=wandb_api_key, ) run_cmd += run_cmd_sample( @@ -538,9 +583,10 @@ def remove_doublequote(file_path): return file_path -def finetune_tab(): +def finetune_tab(headless=False): dummy_db_true = gr.Label(value=True, visible=False) dummy_db_false = gr.Label(value=False, visible=False) + dummy_headless = gr.Label(value=headless, visible=False) gr.Markdown('Train a custom model using kohya finetune python code...') ( @@ -549,7 +595,7 @@ def finetune_tab(): button_save_as_config, config_file_name, button_load_config, - ) = gradio_config() + ) = gradio_config(headless=headless) ( pretrained_model_name_or_path, @@ -557,7 +603,7 @@ def finetune_tab(): v_parameterization, save_model_as, model_list, - ) = gradio_source_model() + ) = gradio_source_model(headless=headless) with gr.Tab('Folders'): with gr.Row(): @@ -566,7 +612,9 @@ def finetune_tab(): placeholder='folder where 
the training configuration files will be saved', ) train_dir_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) train_dir_folder.click( get_folder_path, @@ -579,7 +627,9 @@ def finetune_tab(): placeholder='folder where the training images are located', ) image_folder_input_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) image_folder_input_folder.click( get_folder_path, @@ -592,7 +642,9 @@ def finetune_tab(): placeholder='folder where the model will be saved', ) output_dir_input_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) output_dir_input_folder.click( get_folder_path, @@ -605,7 +657,9 @@ def finetune_tab(): placeholder='Optional: enable logging and output TensorBoard log to this folder', ) logging_dir_input_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) logging_dir_input_folder.click( get_folder_path, @@ -688,6 +742,7 @@ def finetune_tab(): seed, caption_extension, cache_latents, + cache_latents_to_disk, optimizer, optimizer_args, ) = gradio_training(learning_rate_value='1e-5') @@ -723,14 +778,18 @@ def finetune_tab(): bucket_reso_steps, caption_dropout_every_n_epochs, caption_dropout_rate, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, additional_parameters, vae_batch_size, min_snr_gamma, save_every_n_steps, save_last_n_steps, save_last_n_steps_state, - ) = gradio_advanced_training() + use_wandb, + wandb_api_key, + ) = gradio_advanced_training(headless=headless) color_aug.change( color_aug_changed, inputs=[color_aug], @@ -808,6 +867,7 @@ def finetune_tab(): color_aug, model_list, cache_latents, + cache_latents_to_disk, use_latent_files, keep_tokens, persistent_data_loader_workers, @@ -818,7 +878,9 @@ def finetune_tab(): caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -830,9 +892,11 @@ def finetune_tab(): save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ] - button_run.click(train_model, inputs=settings_list) + button_run.click(train_model, inputs=[dummy_headless] + settings_list) button_open_config.click( open_configuration, @@ -864,40 +928,56 @@ def finetune_tab(): def UI(**kwargs): - css = '' + headless = kwargs.get('headless', False) + print(f'headless: {headless}') + if os.path.exists('./style.css'): with open(os.path.join('./style.css'), 'r', encoding='utf8') as file: print('Load CSS...') css += file.read() + '\n' - interface = gr.Blocks(css=css) + interface = gr.Blocks( + css=css, title='Kohya_ss GUI', theme=gr.themes.Default() + ) with interface: with gr.Tab('Finetune'): - finetune_tab() + finetune_tab(headless=headless) with gr.Tab('Utilities'): - utilities_tab(enable_dreambooth_tab=False) + utilities_tab(enable_dreambooth_tab=False, headless=headless) # Show the interface launch_kwargs = {} - if not kwargs.get('username', None) == '': - launch_kwargs['auth'] = ( - kwargs.get('username', None), - kwargs.get('password', None), - ) - if kwargs.get('server_port', 0) > 0: - launch_kwargs['server_port'] = 
kwargs.get('server_port', 0) - if kwargs.get('inbrowser', False): - launch_kwargs['inbrowser'] = kwargs.get('inbrowser', False) - print(launch_kwargs) + username = kwargs.get('username') + password = kwargs.get('password') + server_port = kwargs.get('server_port', 0) + inbrowser = kwargs.get('inbrowser', False) + share = kwargs.get('share', False) + server_name = kwargs.get('listen') + + launch_kwargs['server_name'] = server_name + if username and password: + launch_kwargs['auth'] = (username, password) + if server_port > 0: + launch_kwargs['server_port'] = server_port + if inbrowser: + launch_kwargs['inbrowser'] = inbrowser + if share: + launch_kwargs['share'] = share interface.launch(**launch_kwargs) if __name__ == '__main__': # torch.cuda.set_per_process_memory_fraction(0.48) parser = argparse.ArgumentParser() + parser.add_argument( + '--listen', + type=str, + default='127.0.0.1', + help='IP to listen on for connections to Gradio', + ) parser.add_argument( '--username', type=str, default='', help='Username for authentication' ) @@ -913,6 +993,12 @@ def UI(**kwargs): parser.add_argument( '--inbrowser', action='store_true', help='Open in browser' ) + parser.add_argument( + '--share', action='store_true', help='Share the gradio UI' + ) + parser.add_argument( + '--headless', action='store_true', help='Is the server headless' + ) args = parser.parse_args() @@ -921,4 +1007,7 @@ def UI(**kwargs): password=args.password, inbrowser=args.inbrowser, server_port=args.server_port, + share=args.share, + listen=args.listen, + headless=args.headless, ) diff --git a/gen_img_diffusers.py b/gen_img_diffusers.py index 988eae754..27bd7460d 100644 --- a/gen_img_diffusers.py +++ b/gen_img_diffusers.py @@ -311,6 +311,7 @@ def backward(ctx, do): return dq, dk, dv, None, None, None, None +# TODO common train_util.py def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditionModel, mem_eff_attn, xformers): if mem_eff_attn: replace_unet_cross_attn_to_memory_efficient() @@ -319,7 +320,7 @@ def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditio def replace_unet_cross_attn_to_memory_efficient(): - print("Replace CrossAttention.forward to use NAI style Hypernetwork and FlashAttention") + print("CrossAttention.forward has been replaced to FlashAttention (not xformers) and NAI style Hypernetwork") flash_func = FlashAttentionFunction def forward_flash_attn(self, x, context=None, mask=None): @@ -359,7 +360,7 @@ def forward_flash_attn(self, x, context=None, mask=None): def replace_unet_cross_attn_to_xformers(): - print("Replace CrossAttention.forward to use NAI style Hypernetwork and xformers") + print("CrossAttention.forward has been replaced to enable xformers and NAI style Hypernetwork") try: import xformers.ops except ImportError: @@ -401,6 +402,104 @@ def forward_xformers(self, x, context=None, mask=None): diffusers.models.attention.CrossAttention.forward = forward_xformers +def replace_vae_modules(vae: diffusers.models.AutoencoderKL, mem_eff_attn, xformers): + if mem_eff_attn: + replace_vae_attn_to_memory_efficient() + elif xformers: + # とりあえずDiffusersのxformersを使う。AttentionがあるのはMidBlockのみ + print("Use Diffusers xformers for VAE") + vae.set_use_memory_efficient_attention_xformers(True) + + """ + # VAEがbfloat16でメモリ消費が大きい問題を解決する + upsamplers = [] + for block in vae.decoder.up_blocks: + if block.upsamplers is not None: + upsamplers.extend(block.upsamplers) + + def forward_upsample(_self, hidden_states, output_size=None): + assert hidden_states.shape[1] == _self.channels + 
if _self.use_conv_transpose: + return _self.conv(hidden_states) + + dtype = hidden_states.dtype + if dtype == torch.bfloat16: + assert output_size is None + # repeat_interleaveはすごく遅いが、回数はあまり呼ばれないので許容する + hidden_states = hidden_states.repeat_interleave(2, dim=-1) + hidden_states = hidden_states.repeat_interleave(2, dim=-2) + else: + if hidden_states.shape[0] >= 64: + hidden_states = hidden_states.contiguous() + + # if `output_size` is passed we force the interpolation output + # size and do not make use of `scale_factor=2` + if output_size is None: + hidden_states = torch.nn.functional.interpolate(hidden_states, scale_factor=2.0, mode="nearest") + else: + hidden_states = torch.nn.functional.interpolate(hidden_states, size=output_size, mode="nearest") + + if _self.use_conv: + if _self.name == "conv": + hidden_states = _self.conv(hidden_states) + else: + hidden_states = _self.Conv2d_0(hidden_states) + return hidden_states + + # replace upsamplers + for upsampler in upsamplers: + # make new scope + def make_replacer(upsampler): + def forward(hidden_states, output_size=None): + return forward_upsample(upsampler, hidden_states, output_size) + + return forward + + upsampler.forward = make_replacer(upsampler) +""" + + +def replace_vae_attn_to_memory_efficient(): + print("AttentionBlock.forward has been replaced to FlashAttention (not xformers)") + flash_func = FlashAttentionFunction + + def forward_flash_attn(self, hidden_states): + print("forward_flash_attn") + q_bucket_size = 512 + k_bucket_size = 1024 + + residual = hidden_states + batch, channel, height, width = hidden_states.shape + + # norm + hidden_states = self.group_norm(hidden_states) + + hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2) + + # proj to q, k, v + query_proj = self.query(hidden_states) + key_proj = self.key(hidden_states) + value_proj = self.value(hidden_states) + + query_proj, key_proj, value_proj = map( + lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.num_heads), (query_proj, key_proj, value_proj) + ) + + out = flash_func.apply(query_proj, key_proj, value_proj, None, False, q_bucket_size, k_bucket_size) + + out = rearrange(out, "b h n d -> b n (h d)") + + # compute next hidden_states + hidden_states = self.proj_attn(hidden_states) + hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width) + + # res connect and rescale + hidden_states = (hidden_states + residual) / self.rescale_output_factor + return hidden_states + + diffusers.models.attention.AttentionBlock.forward = forward_flash_attn + + # endregion # region 画像生成の本体:lpw_stable_diffusion.py (ASL)からコピーして修正 @@ -955,7 +1054,7 @@ def __call__( if torch.cuda.is_available(): torch.cuda.empty_cache() init_latents = [] - for i in tqdm(range(0, batch_size, vae_batch_size)): + for i in tqdm(range(0, min(batch_size, len(init_image)), vae_batch_size)): init_latent_dist = self.vae.encode( init_image[i : i + vae_batch_size] if vae_batch_size > 1 else init_image[i].unsqueeze(0) ).latent_dist @@ -2091,7 +2190,7 @@ def main(args): dtype = torch.float32 highres_fix = args.highres_fix_scale is not None - assert not highres_fix or args.image_path is None, f"highres_fix doesn't work with img2img / highres_fixはimg2imgと同時に使えません" + # assert not highres_fix or args.image_path is None, f"highres_fix doesn't work with img2img / highres_fixはimg2imgと同時に使えません" if args.v_parameterization and not args.v2: print("v_parameterization should be with v2 / v1でv_parameterizationを使用することは想定されていません") @@ -2142,6 +2241,7 @@ def main(args): # 
xformers、Hypernetwork対応 if not args.diffusers_xformers: replace_unet_modules(unet, not args.xformers, args.xformers) + replace_vae_modules(vae, not args.xformers, args.xformers) # tokenizerを読み込む print("loading tokenizer") @@ -2250,7 +2350,27 @@ def __getattr__(self, item): device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # "mps"を考量してない # custom pipelineをコピったやつを生成する + if args.vae_slices: + from library.slicing_vae import SlicingAutoencoderKL + + sli_vae = SlicingAutoencoderKL( + act_fn="silu", + block_out_channels=(128, 256, 512, 512), + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D", "DownEncoderBlock2D"], + in_channels=3, + latent_channels=4, + layers_per_block=2, + norm_num_groups=32, + out_channels=3, + sample_size=512, + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D", "UpDecoderBlock2D"], + num_slices=args.vae_slices, + ) + sli_vae.load_state_dict(vae.state_dict()) # vaeのパラメータをコピーする + vae = sli_vae + del sli_vae vae.to(dtype).to(device) + text_encoder.to(dtype).to(device) unet.to(dtype).to(device) if clip_model is not None: @@ -2262,6 +2382,8 @@ def __getattr__(self, item): if args.network_module: networks = [] network_default_muls = [] + network_pre_calc = args.network_pre_calc + for i, network_module in enumerate(args.network_module): print("import network module:", network_module) imported_module = importlib.import_module(network_module) @@ -2298,11 +2420,11 @@ def __getattr__(self, item): if network is None: return - mergiable = hasattr(network, "merge_to") - if args.network_merge and not mergiable: + mergeable = network.is_mergeable() + if args.network_merge and not mergeable: print("network is not mergiable. ignore merge option.") - if not args.network_merge or not mergiable: + if not args.network_merge or not mergeable: network.apply_to(text_encoder, unet) info = network.load_state_dict(weights_sd, False) # network.load_weightsを使うようにするとよい print(f"weights are loaded: {info}") @@ -2311,6 +2433,10 @@ def __getattr__(self, item): network.to(memory_format=torch.channels_last) network.to(dtype).to(device) + if network_pre_calc: + print("backup original weights") + network.backup_weights() + networks.append(network) else: network.merge_to(text_encoder, unet, weights_sd, dtype, device) @@ -2586,12 +2712,18 @@ def resize_images(imgs, size): # 画像サイズにオプション指定があるときはリサイズする if args.W is not None and args.H is not None: + # highres fix を考慮に入れる + w, h = args.W, args.H + if highres_fix: + w = int(w * args.highres_fix_scale + 0.5) + h = int(h * args.highres_fix_scale + 0.5) + if init_images is not None: - print(f"resize img2img source images to {args.W}*{args.H}") - init_images = resize_images(init_images, (args.W, args.H)) + print(f"resize img2img source images to {w}*{h}") + init_images = resize_images(init_images, (w, h)) if mask_images is not None: - print(f"resize img2img mask images to {args.W}*{args.H}") - mask_images = resize_images(mask_images, (args.W, args.H)) + print(f"resize img2img mask images to {w}*{h}") + mask_images = resize_images(mask_images, (w, h)) regional_network = False if networks and mask_images: @@ -2665,13 +2797,15 @@ def process_batch(batch: List[BatchData], highres_fix, highres_1st=False): width_1st = width_1st - width_1st % 32 height_1st = height_1st - height_1st % 32 + strength_1st = ext.strength if args.highres_fix_strength is None else args.highres_fix_strength + ext_1st = BatchDataExt( width_1st, height_1st, args.highres_fix_steps, ext.scale, ext.negative_scale, - ext.strength, 
+ strength_1st, ext.network_muls, ext.num_sub_prompts, ) @@ -2815,12 +2949,20 @@ def process_batch(batch: List[BatchData], highres_fix, highres_1st=False): # generate if networks: + # 追加ネットワークの処理 shared = {} for n, m in zip(networks, network_muls if network_muls else network_default_muls): n.set_multiplier(m) if regional_network: n.set_current_generation(batch_size, num_sub_prompts, width, height, shared) + if not regional_network and network_pre_calc: + for n in networks: + n.restore_weights() + for n in networks: + n.pre_calculation() + print("pre-calculation... done") + images = pipe( prompts, negative_prompts, @@ -3018,14 +3160,16 @@ def process_batch(batch: List[BatchData], highres_fix, highres_1st=False): if init_images is not None: init_image = init_images[global_step % len(init_images)] + # img2imgの場合は、基本的に元画像のサイズで生成する。highres fixの場合はargs.W, args.Hとscaleに従いリサイズ済みなので無視する # 32単位に丸めたやつにresizeされるので踏襲する - width, height = init_image.size - width = width - width % 32 - height = height - height % 32 - if width != init_image.size[0] or height != init_image.size[1]: - print( - f"img2img image size is not divisible by 32 so aspect ratio is changed / img2imgの画像サイズが32で割り切れないためリサイズされます。画像が歪みます" - ) + if not highres_fix: + width, height = init_image.size + width = width - width % 32 + height = height - height % 32 + if width != init_image.size[0] or height != init_image.size[1]: + print( + f"img2img image size is not divisible by 32 so aspect ratio is changed / img2imgの画像サイズが32で割り切れないためリサイズされます。画像が歪みます" + ) if mask_images is not None: mask_image = mask_images[global_step % len(mask_images)] @@ -3127,6 +3271,12 @@ def setup_parser() -> argparse.ArgumentParser: default=None, help="batch size for VAE, < 1.0 for ratio / VAE処理時のバッチサイズ、1未満の値の場合は通常バッチサイズの比率", ) + parser.add_argument( + "--vae_slices", + type=int, + default=None, + help="number of slices to split image into for VAE to reduce VRAM usage, None for no splitting (default), slower if specified. 
16 or 32 recommended / VAE処理時にVRAM使用量削減のため画像を分割するスライス数、Noneの場合は分割しない(デフォルト)、指定すると遅くなる。16か32程度を推奨", + ) parser.add_argument("--steps", type=int, default=50, help="number of ddim sampling steps / サンプリングステップ数") parser.add_argument( "--sampler", @@ -3204,6 +3354,9 @@ def setup_parser() -> argparse.ArgumentParser: ) parser.add_argument("--network_show_meta", action="store_true", help="show metadata of network model / ネットワークモデルのメタデータを表示する") parser.add_argument("--network_merge", action="store_true", help="merge network weights to original model / ネットワークの重みをマージする") + parser.add_argument( + "--network_pre_calc", action="store_true", help="pre-calculate network for generation / ネットワークのあらかじめ計算して生成する" + ) parser.add_argument( "--textual_inversion_embeddings", type=str, @@ -3261,6 +3414,12 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--highres_fix_steps", type=int, default=28, help="1st stage steps for highres fix / highres fixの最初のステージのステップ数" ) + parser.add_argument( + "--highres_fix_strength", + type=float, + default=None, + help="1st stage img2img strength for highres fix / highres fixの最初のステージのimg2img時のstrength、省略時はstrengthと同じ", + ) parser.add_argument( "--highres_fix_save_1st", action="store_true", help="save 1st stage images for highres fix / highres fixの最初のステージの画像を保存する" ) diff --git a/gui.sh b/gui.sh index cc4f9dc57..857d98e20 100755 --- a/gui.sh +++ b/gui.sh @@ -1,5 +1,12 @@ #!/usr/bin/env bash +# If it is run with the sudo command, get the complete LD_LIBRARY_PATH environment variable of the system and assign it to the current environment, +# because it will be used later. +if [ -n "$SUDO_USER" ] || [ -n "$SUDO_COMMAND" ] ; then + echo "The sudo command resets the non-essential environment variables, we keep the LD_LIBRARY_PATH variable." + export LD_LIBRARY_PATH=$(sudo -i printenv LD_LIBRARY_PATH) +fi + # This gets the directory the script is run from so pathing can work relative to the script where needed. 
SCRIPT_DIR=$(cd -- "$(dirname -- "$0")" && pwd) diff --git a/kohya_gui.py b/kohya_gui.py index 088f6dade..abeae4174 100644 --- a/kohya_gui.py +++ b/kohya_gui.py @@ -83,6 +83,9 @@ def setup_logging(clean=False): def UI(**kwargs): css = '' + headless = kwargs.get('headless', False) + print(f'headless: {headless}') + if os.path.exists('./style.css'): with open(os.path.join('./style.css'), 'r', encoding='utf8') as file: log.info('Load CSS...') @@ -99,13 +102,13 @@ def UI(**kwargs): reg_data_dir_input, output_dir_input, logging_dir_input, - ) = dreambooth_tab() + ) = dreambooth_tab(headless=headless) with gr.Tab('Dreambooth LoRA'): - lora_tab() + lora_tab(headless=headless) with gr.Tab('Dreambooth TI'): - ti_tab() + ti_tab(headless=headless) with gr.Tab('Finetune'): - finetune_tab() + finetune_tab(headless=headless) with gr.Tab('Utilities'): utilities_tab( train_data_dir_input=train_data_dir_input, @@ -113,13 +116,15 @@ def UI(**kwargs): output_dir_input=output_dir_input, logging_dir_input=logging_dir_input, enable_copy_info_button=True, + headless=headless, ) - gradio_extract_dylora_tab() - gradio_extract_lora_tab() - gradio_extract_lycoris_locon_tab() - gradio_merge_lora_tab() - gradio_merge_lycoris_tab() - gradio_resize_lora_tab() + with gr.Tab('LoRA'): + gradio_extract_dylora_tab(headless=headless) + gradio_extract_lora_tab(headless=headless) + gradio_extract_lycoris_locon_tab(headless=headless) + gradio_merge_lora_tab(headless=headless) + gradio_merge_lycoris_tab(headless=headless) + gradio_resize_lora_tab(headless=headless) # Show the interface launch_kwargs = {} @@ -169,6 +174,9 @@ def UI(**kwargs): parser.add_argument( '--share', action='store_true', help='Share the gradio UI' ) + parser.add_argument( + '--headless', action='store_true', help='Is the server headless' + ) args = parser.parse_args() @@ -179,4 +187,5 @@ def UI(**kwargs): server_port=args.server_port, share=args.share, listen=args.listen, + headless=args.headless, ) diff --git a/library/basic_caption_gui.py b/library/basic_caption_gui.py index b2d208d1e..20b890a7a 100644 --- a/library/basic_caption_gui.py +++ b/library/basic_caption_gui.py @@ -68,7 +68,7 @@ def caption_images( # Gradio UI -def gradio_basic_caption_gui_tab(): +def gradio_basic_caption_gui_tab(headless=False): with gr.Tab('Basic Captioning'): gr.Markdown( 'This utility will allow the creation of simple caption files for each image in a folder.' @@ -79,7 +79,9 @@ def gradio_basic_caption_gui_tab(): placeholder='Directory containing the images to caption', interactive=True, ) - folder_button = gr.Button('📂', elem_id='open_folder_small') + folder_button = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) folder_button.click( get_folder_path, outputs=images_dir, diff --git a/library/blip_caption_gui.py b/library/blip_caption_gui.py index 2e0081ddc..7a5766cc7 100644 --- a/library/blip_caption_gui.py +++ b/library/blip_caption_gui.py @@ -71,7 +71,7 @@ def caption_images( ### -def gradio_blip_caption_gui_tab(): +def gradio_blip_caption_gui_tab(headless=False): with gr.Tab('BLIP Captioning'): gr.Markdown( 'This utility will use BLIP to caption files for each images in a folder.' 
@@ -83,7 +83,7 @@ def gradio_blip_caption_gui_tab(): interactive=True, ) button_train_data_dir_input = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) button_train_data_dir_input.click( get_folder_path, diff --git a/library/common_gui.py b/library/common_gui.py index 284f8ec56..1b2e88ded 100644 --- a/library/common_gui.py +++ b/library/common_gui.py @@ -35,7 +35,15 @@ ENV_EXCLUSION = ['COLAB_GPU', 'RUNPOD_POD_ID'] -def check_if_model_exist(output_name, output_dir, save_model_as): +def check_if_model_exist( + output_name, output_dir, save_model_as, headless=False +): + if headless: + print( + 'Headless mode, skipping verification if model already exist... if model already exist it will be overwritten...' + ) + return False + if save_model_as in ['diffusers', 'diffusers_safetendors']: ckpt_folder = os.path.join(output_dir, output_name) if os.path.isdir(ckpt_folder): @@ -63,6 +71,13 @@ def check_if_model_exist(output_name, output_dir, save_model_as): return False +def output_message(msg='', title='', headless=False): + if headless: + print(msg) + else: + msgbox(msg=msg, title=title) + + def update_my_data(my_data): # Update the optimizer based on the use_8bit_adam flag use_8bit_adam = my_data.get('use_8bit_adam', False) @@ -70,17 +85,30 @@ def update_my_data(my_data): # Update model_list to custom if empty or pretrained_model_name_or_path is not a preset model model_list = my_data.get('model_list', []) - pretrained_model_name_or_path = my_data.get('pretrained_model_name_or_path', '') - if not model_list or pretrained_model_name_or_path not in ALL_PRESET_MODELS: + pretrained_model_name_or_path = my_data.get( + 'pretrained_model_name_or_path', '' + ) + if ( + not model_list + or pretrained_model_name_or_path not in ALL_PRESET_MODELS + ): my_data['model_list'] = 'custom' - # Convert epoch and save_every_n_epochs values to int if they are strings - for key in ['epoch', 'save_every_n_epochs']: - value = my_data.get(key, -1) - if isinstance(value, str) and value.isdigit(): + # Convert values to int if they are strings + for key in ['epoch', 'save_every_n_epochs', 'lr_warmup']: + value = my_data.get(key, 0) + if isinstance(value, str) and value.strip().isdigit(): my_data[key] = int(value) elif not value: - my_data[key] = -1 + my_data[key] = 0 + + # Convert values to float if they are strings + for key in ['noise_offset', 'learning_rate', 'text_encoder_lr', 'unet_lr']: + value = my_data.get(key, 0) + if isinstance(value, str) and value.strip().isdigit(): + my_data[key] = float(value) + elif not value: + my_data[key] = 0 # Update LoRA_type if it is set to LoCon if my_data.get('LoRA_type', 'Standard') == 'LoCon': @@ -88,12 +116,9 @@ def update_my_data(my_data): # Update model save choices due to changes for LoRA and TI training if ( - (my_data.get('LoRA_type') or my_data.get('num_vectors_per_token')) - and my_data.get('save_model_as') not in ['safetensors', 'ckpt'] - ): - message = ( - 'Updating save_model_as to safetensors because the current value in the config file is no longer applicable to {}' - ) + my_data.get('LoRA_type') or my_data.get('num_vectors_per_token') + ) and my_data.get('save_model_as') not in ['safetensors', 'ckpt']: + message = 'Updating save_model_as to safetensors because the current value in the config file is no longer applicable to {}' if my_data.get('LoRA_type'): print(message.format('LoRA')) if my_data.get('num_vectors_per_token'): @@ -121,7 +146,10 @@ def get_dir_and_file(file_path): def get_file_path( file_path='', 
default_extension='.json', extension_name='Config files' ): - if not any(var in os.environ for var in ENV_EXCLUSION) and sys.platform != 'darwin': + if ( + not any(var in os.environ for var in ENV_EXCLUSION) + and sys.platform != 'darwin' + ): current_file_path = file_path # print(f'current file path: {current_file_path}') @@ -156,7 +184,10 @@ def get_file_path( def get_any_file_path(file_path=''): - if not any(var in os.environ for var in ENV_EXCLUSION) and sys.platform != 'darwin': + if ( + not any(var in os.environ for var in ENV_EXCLUSION) + and sys.platform != 'darwin' + ): current_file_path = file_path # print(f'current file path: {current_file_path}') @@ -198,7 +229,10 @@ def remove_doublequote(file_path): def get_folder_path(folder_path=''): - if not any(var in os.environ for var in ENV_EXCLUSION) and sys.platform != 'darwin': + if ( + not any(var in os.environ for var in ENV_EXCLUSION) + and sys.platform != 'darwin' + ): current_folder_path = folder_path initial_dir, initial_file = get_dir_and_file(folder_path) @@ -218,7 +252,10 @@ def get_folder_path(folder_path=''): def get_saveasfile_path( file_path='', defaultextension='.json', extension_name='Config files' ): - if not any(var in os.environ for var in ENV_EXCLUSION) and sys.platform != 'darwin': + if ( + not any(var in os.environ for var in ENV_EXCLUSION) + and sys.platform != 'darwin' + ): current_file_path = file_path # print(f'current file path: {current_file_path}') @@ -254,7 +291,10 @@ def get_saveasfile_path( def get_saveasfilename_path( file_path='', extensions='*', extension_name='Config files' ): - if not any(var in os.environ for var in ENV_EXCLUSION) and sys.platform != 'darwin': + if ( + not any(var in os.environ for var in ENV_EXCLUSION) + and sys.platform != 'darwin' + ): current_file_path = file_path # print(f'current file path: {current_file_path}') @@ -264,7 +304,10 @@ def get_saveasfilename_path( root.wm_attributes('-topmost', 1) root.withdraw() save_file_path = filedialog.asksaveasfilename( - filetypes=((f'{extension_name}', f'{extensions}'), ('All files', '*')), + filetypes=( + (f'{extension_name}', f'{extensions}'), + ('All files', '*'), + ), defaultextension=extensions, initialdir=initial_dir, initialfile=initial_file, @@ -310,11 +353,11 @@ def add_pre_postfix( caption_file_path = os.path.join(folder, caption_file_name) if not os.path.exists(caption_file_path): - with open(caption_file_path, 'w') as f: + with open(caption_file_path, 'w', encoding='utf8') as f: separator = ' ' if prefix and postfix else '' f.write(f'{prefix}{separator}{postfix}') else: - with open(caption_file_path, 'r+') as f: + with open(caption_file_path, 'r+', encoding='utf8') as f: content = f.read() content = content.rstrip() f.seek(0, 0) @@ -501,13 +544,17 @@ def set_model_list( ### -def gradio_config(): +def gradio_config(headless=False): with gr.Accordion('Configuration file', open=False): with gr.Row(): - button_open_config = gr.Button('Open 📂', elem_id='open_folder') - button_save_config = gr.Button('Save 💾', elem_id='open_folder') + button_open_config = gr.Button( + 'Open 📂', elem_id='open_folder', visible=(not headless) + ) + button_save_config = gr.Button( + 'Save 💾', elem_id='open_folder', visible=(not headless) + ) button_save_as_config = gr.Button( - 'Save as... 💾', elem_id='open_folder' + 'Save as... 
💾', elem_id='open_folder', visible=(not headless) ) config_file_name = gr.Textbox( label='', @@ -538,13 +585,16 @@ def get_pretrained_model_name_or_path_file( set_model_list(model_list, pretrained_model_name_or_path) -def gradio_source_model(save_model_as_choices = [ - 'same as source model', - 'ckpt', - 'diffusers', - 'diffusers_safetensors', - 'safetensors', - ]): +def gradio_source_model( + save_model_as_choices=[ + 'same as source model', + 'ckpt', + 'diffusers', + 'diffusers_safetensors', + 'safetensors', + ], + headless=False, +): with gr.Tab('Source model'): # Define the input elements with gr.Row(): @@ -554,7 +604,9 @@ def gradio_source_model(save_model_as_choices = [ value='runwayml/stable-diffusion-v1-5', ) pretrained_model_name_or_path_file = gr.Button( - document_symbol, elem_id='open_folder_small' + document_symbol, + elem_id='open_folder_small', + visible=(not headless), ) pretrained_model_name_or_path_file.click( get_any_file_path, @@ -563,7 +615,9 @@ def gradio_source_model(save_model_as_choices = [ show_progress=False, ) pretrained_model_name_or_path_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) pretrained_model_name_or_path_folder.click( get_folder_path, @@ -696,9 +750,12 @@ def gradio_training( value=2, ) seed = gr.Textbox(label='Seed', placeholder='(Optional) eg:1234') - cache_latents = gr.Checkbox(label='Cache latent', value=True) + cache_latents = gr.Checkbox(label='Cache latents', value=True) + cache_latents_to_disk = gr.Checkbox( + label='Cache latents to disk', value=False + ) with gr.Row(): - learning_rate = gr.Textbox( + learning_rate = gr.Number( label='Learning rate', value=learning_rate_value ) lr_scheduler = gr.Dropdown( @@ -714,8 +771,12 @@ def gradio_training( ], value=lr_scheduler_value, ) - lr_warmup = gr.Textbox( - label='LR warmup (% of steps)', value=lr_warmup_value + lr_warmup = gr.Slider( + label='LR warmup (% of steps)', + value=lr_warmup_value, + minimum=0, + maximum=100, + step=1, ) optimizer = gr.Dropdown( label='Optimizer', @@ -724,7 +785,15 @@ def gradio_training( 'AdamW8bit', 'Adafactor', 'DAdaptation', + 'DAdaptAdaGrad', + 'DAdaptAdam', + 'DAdaptAdan', + 'DAdaptAdanIP', + 'DAdaptAdamPreprint', + 'DAdaptLion', + 'DAdaptSGD', 'Lion', + 'Lion8bit', 'SGDNesterov', 'SGDNesterov8bit', ], @@ -749,55 +818,116 @@ def gradio_training( seed, caption_extension, cache_latents, + cache_latents_to_disk, optimizer, optimizer_args, ) +def get_int_or_default(kwargs, key, default_value=0): + value = kwargs.get(key, default_value) + if isinstance(value, int): + return value + elif isinstance(value, str): + return int(value) + elif isinstance(value, float): + return int(value) + else: + print(f'{key} is not an int, float or a string, setting value to {default_value}') + return default_value + +def get_float_or_default(kwargs, key, default_value=0.0): + value = kwargs.get(key, default_value) + if isinstance(value, float): + return value + elif isinstance(value, int): + return float(value) + elif isinstance(value, str): + return float(value) + else: + print(f'{key} is not an int, float or a string, setting value to {default_value}') + return default_value + +def get_str_or_default(kwargs, key, default_value=""): + value = kwargs.get(key, default_value) + if isinstance(value, str): + return value + elif isinstance(value, int): + return str(value) + elif isinstance(value, str): + return str(value) + else: + return default_value def run_cmd_training(**kwargs): - options = [ 
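The new `get_int_or_default`, `get_float_or_default` and `get_str_or_default` helpers normalise values coming out of the GUI before they are written into a command line. Two small review notes: the second `isinstance(value, str)` branch in `get_str_or_default` can never be reached (the first one already returns), and a non-numeric string will still raise inside `get_int_or_default`/`get_float_or_default` because the cast is not guarded. A generic, fully guarded variant would look like the sketch below; `get_typed_or_default` is illustrative, not part of the patch.

```python
def get_typed_or_default(kwargs, key, cast, default):
    """Generic form of get_int_or_default / get_float_or_default:
    cast whatever is stored under `key`, falling back to `default`
    (with a warning) when the cast is impossible."""
    value = kwargs.get(key, default)
    try:
        return cast(value)
    except (TypeError, ValueError):
        print(f'{key} could not be read as {cast.__name__}, using {default}')
        return default


kwargs = {'clip_skip': '2', 'noise_offset': 0.05, 'seed': None}
print(get_typed_or_default(kwargs, 'clip_skip', int, 1))         # 2
print(get_typed_or_default(kwargs, 'noise_offset', float, 0.0))  # 0.05
print(get_typed_or_default(kwargs, 'seed', int, 0))              # warning, then 0
```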
- f' --learning_rate="{kwargs.get("learning_rate", "")}"' - if kwargs.get('learning_rate') - else '', - f' --lr_scheduler="{kwargs.get("lr_scheduler", "")}"' - if kwargs.get('lr_scheduler') - else '', - f' --lr_warmup_steps="{kwargs.get("lr_warmup_steps", "")}"' - if kwargs.get('lr_warmup_steps') - else '', - f' --train_batch_size="{kwargs.get("train_batch_size", "")}"' - if kwargs.get('train_batch_size') - else '', - f' --max_train_steps="{kwargs.get("max_train_steps", "")}"' - if kwargs.get('max_train_steps') - else '', - f' --save_every_n_epochs="{int(kwargs.get("save_every_n_epochs", 1))}"' - if int(kwargs.get('save_every_n_epochs')) - else '', - f' --mixed_precision="{kwargs.get("mixed_precision", "")}"' - if kwargs.get('mixed_precision') - else '', - f' --save_precision="{kwargs.get("save_precision", "")}"' - if kwargs.get('save_precision') - else '', - f' --seed="{kwargs.get("seed", "")}"' - if kwargs.get('seed') != '' - else '', - f' --caption_extension="{kwargs.get("caption_extension", "")}"' - if kwargs.get('caption_extension') - else '', - ' --cache_latents' if kwargs.get('cache_latents') else '', - # ' --use_lion_optimizer' if kwargs.get('optimizer') == 'Lion' else '', - f' --optimizer_type="{kwargs.get("optimizer", "AdamW")}"', - f' --optimizer_args {kwargs.get("optimizer_args", "")}' - if not kwargs.get('optimizer_args') == '' - else '', - ] - run_cmd = ''.join(options) + run_cmd = '' + + learning_rate = kwargs.get("learning_rate", "") + if learning_rate: + run_cmd += f' --learning_rate="{learning_rate}"' + + lr_scheduler = kwargs.get("lr_scheduler", "") + if lr_scheduler: + run_cmd += f' --lr_scheduler="{lr_scheduler}"' + + lr_warmup_steps = kwargs.get("lr_warmup_steps", "") + if lr_warmup_steps: + if lr_scheduler == 'constant': + print('Can\'t use LR warmup with LR Scheduler constant... 
ignoring...') + else: + run_cmd += f' --lr_warmup_steps="{lr_warmup_steps}"' + + train_batch_size = kwargs.get("train_batch_size", "") + if train_batch_size: + run_cmd += f' --train_batch_size="{train_batch_size}"' + + max_train_steps = kwargs.get("max_train_steps", "") + if max_train_steps: + run_cmd += f' --max_train_steps="{max_train_steps}"' + + save_every_n_epochs = kwargs.get("save_every_n_epochs") + if save_every_n_epochs: + run_cmd += f' --save_every_n_epochs="{int(save_every_n_epochs)}"' + + mixed_precision = kwargs.get("mixed_precision", "") + if mixed_precision: + run_cmd += f' --mixed_precision="{mixed_precision}"' + + save_precision = kwargs.get("save_precision", "") + if save_precision: + run_cmd += f' --save_precision="{save_precision}"' + + seed = kwargs.get("seed", "") + if seed != '': + run_cmd += f' --seed="{seed}"' + + caption_extension = kwargs.get("caption_extension", "") + if caption_extension: + run_cmd += f' --caption_extension="{caption_extension}"' + + cache_latents = kwargs.get('cache_latents') + if cache_latents: + run_cmd += ' --cache_latents' + + cache_latents_to_disk = kwargs.get('cache_latents_to_disk') + if cache_latents_to_disk: + run_cmd += ' --cache_latents_to_disk' + + optimizer_type = kwargs.get("optimizer", "AdamW") + run_cmd += f' --optimizer_type="{optimizer_type}"' + + optimizer_args = kwargs.get("optimizer_args", "") + if optimizer_args != '': + run_cmd += f' --optimizer_args {optimizer_args}' + return run_cmd -def gradio_advanced_training(): +def gradio_advanced_training(headless=False): + def noise_offset_type_change(noise_offset_type): + if noise_offset_type == 'Original': + return (gr.Group.update(visible=True), gr.Group.update(visible=False)) + else: + return (gr.Group.update(visible=False), gr.Group.update(visible=True)) with gr.Row(): additional_parameters = gr.Textbox( label='Additional parameters', @@ -805,13 +935,22 @@ def gradio_advanced_training(): ) with gr.Row(): save_every_n_steps = gr.Number( - label='Save every N steps', value=0, precision=0, info='(Optional) The model is saved every specified steps' + label='Save every N steps', + value=0, + precision=0, + info='(Optional) The model is saved every specified steps', ) save_last_n_steps = gr.Number( - label='Save last N steps', value=0, precision=0, info='(Optional) Save only the specified number of models (old models will be deleted)' + label='Save last N steps', + value=0, + precision=0, + info='(Optional) Save only the specified number of models (old models will be deleted)', ) save_last_n_steps_state = gr.Number( - label='Save last N steps', value=0, precision=0, info='(Optional) Save only the specified number of states (old models will be deleted)' + label='Save last N steps', + value=0, + precision=0, + info='(Optional) Save only the specified number of states (old models will be deleted)', ) with gr.Row(): keep_tokens = gr.Slider( @@ -851,7 +990,9 @@ def gradio_advanced_training(): xformers = gr.Checkbox(label='Use xformers', value=True) color_aug = gr.Checkbox(label='Color augmentation', value=False) flip_aug = gr.Checkbox(label='Flip augmentation', value=False) - min_snr_gamma = gr.Slider(label='Min SNR gamma', value = 0, minimum=0, maximum=20, step=1) + min_snr_gamma = gr.Slider( + label='Min SNR gamma', value=0, minimum=0, maximum=20, step=1 + ) with gr.Row(): bucket_no_upscale = gr.Checkbox( label="Don't upscale bucket resolution", value=True @@ -862,10 +1003,55 @@ def gradio_advanced_training(): random_crop = gr.Checkbox( label='Random crop instead of center crop', 
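`run_cmd_training` has been rewritten from one long list of conditional f-strings into straight-line code that appends one flag at a time, which makes special cases such as "drop `--lr_warmup_steps` when the scheduler is `constant`" easy to express. As a quick sanity check of what the rewritten function emits (a hedged example with arbitrary argument values, assuming the repository root is importable):

```python
from library.common_gui import run_cmd_training

cmd = run_cmd_training(
    learning_rate=1e-4,
    lr_scheduler='constant',
    lr_warmup_steps=10,        # ignored (with a warning) because the scheduler is constant
    train_batch_size=2,
    mixed_precision='bf16',
    cache_latents=True,
    cache_latents_to_disk=True,
    optimizer='AdamW8bit',
)
print(cmd)
#  --learning_rate="0.0001" --lr_scheduler="constant" --train_batch_size="2"
#  --mixed_precision="bf16" --cache_latents --cache_latents_to_disk
#  --optimizer_type="AdamW8bit"
```

Note that `--optimizer_type` is always emitted, even when the optimizer is left at its default.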
value=False ) - noise_offset = gr.Textbox( - label='Noise offset (0 - 1)', placeholder='(Oprional) eg: 0.1' + + with gr.Row(): + noise_offset_type = gr.Dropdown( + label='Noise offset type', + choices=[ + 'Original', + 'Multires', + ], + value='Original', + ) + with gr.Row(visible=True) as noise_offset_original: + noise_offset = gr.Slider( + label='Noise offset', + value=0, + minimum=0, + maximum=1, + step=0.01, + info='recommended values are 0.05 - 0.15', + ) + adaptive_noise_scale = gr.Slider( + label='Adaptive noise scale', + value=0, + minimum=-1, + maximum=1, + step=0.001, + info='(Experimental, Optional) Since the latent is close to a normal distribution, it may be a good idea to specify a value around 1/10 the noise offset.', + ) + with gr.Row(visible=False) as noise_offset_multires: + multires_noise_iterations = gr.Slider( + label='Multires noise iterations', + value=0, + minimum=0, + maximum=64, + step=1, + info='enable multires noise (recommended values are 6-10)', + ) + multires_noise_discount = gr.Slider( + label='Multires noise discount', + value=0, + minimum=0, + maximum=1, + step=0.01, + info='recommended values are 0.8. For LoRAs with small datasets, 0.1-0.3', + ) + noise_offset_type.change( + noise_offset_type_change, + inputs=[noise_offset_type], + outputs=[noise_offset_original, noise_offset_multires] ) - with gr.Row(): caption_dropout_every_n_epochs = gr.Number( label='Dropout caption every n epochs', value=0 @@ -874,11 +1060,7 @@ def gradio_advanced_training(): label='Rate of caption dropout', value=0, minimum=0, maximum=1 ) vae_batch_size = gr.Slider( - label='VAE batch size', - minimum=0, - maximum=32, - value=0, - step=1 + label='VAE batch size', minimum=0, maximum=32, value=0, step=1 ) with gr.Row(): save_state = gr.Checkbox(label='Save training state', value=False) @@ -886,7 +1068,9 @@ def gradio_advanced_training(): label='Resume from saved training state', placeholder='path to "last-state" state folder to resume from', ) - resume_button = gr.Button('📂', elem_id='open_folder_small') + resume_button = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) resume_button.click( get_folder_path, outputs=resume, @@ -899,7 +1083,19 @@ def gradio_advanced_training(): max_data_loader_n_workers = gr.Textbox( label='Max num workers for DataLoader', placeholder='(Optional) Override number of epoch. 
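The new "Noise offset type" dropdown drives a small show/hide handler (`noise_offset_type_change`) that swaps between the Original row (`noise_offset`, `adaptive_noise_scale`) and the Multires row (`multires_noise_iterations`, `multires_noise_discount`). A standalone sketch of that Gradio pattern, using the generic `gr.update(...)` form, is shown below; it is illustrative and not copied from the patch.

```python
import gradio as gr

with gr.Blocks() as demo:
    noise_offset_type = gr.Dropdown(
        label='Noise offset type', choices=['Original', 'Multires'], value='Original'
    )
    with gr.Row(visible=True) as original_row:
        gr.Slider(label='Noise offset', minimum=0, maximum=1, step=0.01, value=0)
    with gr.Row(visible=False) as multires_row:
        gr.Slider(label='Multires noise iterations', minimum=0, maximum=64, step=1, value=0)

    def toggle(selected):
        # Return one visibility update per output row
        return (
            gr.update(visible=selected == 'Original'),
            gr.update(visible=selected == 'Multires'),
        )

    noise_offset_type.change(
        toggle, inputs=[noise_offset_type], outputs=[original_row, multires_row]
    )

# demo.launch()  # uncomment to try the toggle locally
```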
Default: 8', - value="0", + value='0', + ) + with gr.Row(): + wandb_api_key = gr.Textbox( + label='WANDB API Key', + value='', + placeholder='(Optional)', + info='Users can obtain and/or generate an api key in the their user settings on the website: https://wandb.ai/login', + ) + use_wandb = gr.Checkbox( + label='WANDB Logging', + value=False, + info='If unchecked, tensorboard will be used as the default for logging.', ) return ( # use_8bit_adam, @@ -923,79 +1119,152 @@ def gradio_advanced_training(): bucket_reso_steps, caption_dropout_every_n_epochs, caption_dropout_rate, + noise_offset_type, noise_offset, + adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, additional_parameters, vae_batch_size, min_snr_gamma, save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ) def run_cmd_advanced_training(**kwargs): - options = [ - f' --max_train_epochs="{kwargs.get("max_train_epochs", "")}"' - if kwargs.get('max_train_epochs') - else '', - f' --max_data_loader_n_workers="{kwargs.get("max_data_loader_n_workers", "")}"' - if kwargs.get('max_data_loader_n_workers') - else '', - f' --max_token_length={kwargs.get("max_token_length", "")}' - if int(kwargs.get('max_token_length', 75)) > 75 - else '', - f' --clip_skip={kwargs.get("clip_skip", "")}' - if int(kwargs.get('clip_skip', 1)) > 1 - else '', - f' --resume="{kwargs.get("resume", "")}"' - if kwargs.get('resume') - else '', - f' --keep_tokens="{kwargs.get("keep_tokens", "")}"' - if int(kwargs.get('keep_tokens', 0)) > 0 - else '', - f' --caption_dropout_every_n_epochs="{int(kwargs.get("caption_dropout_every_n_epochs", 0))}"' - if int(kwargs.get('caption_dropout_every_n_epochs', 0)) > 0 - else '', - f' --caption_dropout_rate="{float(kwargs.get("caption_dropout_rate", 0))}"' - if float(kwargs.get('caption_dropout_rate', 0)) > 0 - else '', - f' --vae_batch_size="{kwargs.get("vae_batch_size", 0)}"' - if int(kwargs.get('vae_batch_size', 0)) > 0 - else '', - f' --bucket_reso_steps={int(kwargs.get("bucket_reso_steps", 1))}' - if int(kwargs.get('bucket_reso_steps', 64)) >= 1 - else '', - f' --save_every_n_steps="{int(kwargs.get("save_every_n_steps", 0))}"' - if int(kwargs.get('save_every_n_steps')) > 0 - else '', - f' --save_last_n_steps="{int(kwargs.get("save_last_n_steps", 0))}"' - if int(kwargs.get('save_last_n_steps')) > 0 - else '', - f' --save_last_n_steps_state="{int(kwargs.get("save_last_n_steps_state", 0))}"' - if int(kwargs.get('save_last_n_steps_state')) > 0 - else '', - f' --min_snr_gamma={int(kwargs.get("min_snr_gamma", 0))}' - if int(kwargs.get('min_snr_gamma', 0)) >= 1 - else '', - ' --save_state' if kwargs.get('save_state') else '', - ' --mem_eff_attn' if kwargs.get('mem_eff_attn') else '', - ' --color_aug' if kwargs.get('color_aug') else '', - ' --flip_aug' if kwargs.get('flip_aug') else '', - ' --shuffle_caption' if kwargs.get('shuffle_caption') else '', - ' --gradient_checkpointing' if kwargs.get('gradient_checkpointing') - else '', - ' --full_fp16' if kwargs.get('full_fp16') else '', - ' --xformers' if kwargs.get('xformers') else '', - # ' --use_8bit_adam' if kwargs.get('use_8bit_adam') else '', - ' --persistent_data_loader_workers' - if kwargs.get('persistent_data_loader_workers') - else '', - ' --bucket_no_upscale' if kwargs.get('bucket_no_upscale') else '', - ' --random_crop' if kwargs.get('random_crop') else '', - f' --noise_offset={float(kwargs.get("noise_offset", 0))}' - if not kwargs.get('noise_offset', '') == '' - else '', - f' {kwargs.get("additional_parameters", 
"")}', - ] - run_cmd = ''.join(options) + run_cmd = '' + + max_train_epochs = kwargs.get("max_train_epochs", "") + if max_train_epochs: + run_cmd += f' --max_train_epochs={max_train_epochs}' + + max_data_loader_n_workers = kwargs.get("max_data_loader_n_workers", "") + if max_data_loader_n_workers: + run_cmd += f' --max_data_loader_n_workers="{max_data_loader_n_workers}"' + + max_token_length = int(kwargs.get("max_token_length", 75)) + if max_token_length > 75: + run_cmd += f' --max_token_length={max_token_length}' + + clip_skip = int(kwargs.get("clip_skip", 1)) + if clip_skip > 1: + run_cmd += f' --clip_skip={clip_skip}' + + resume = kwargs.get("resume", "") + if resume: + run_cmd += f' --resume="{resume}"' + + keep_tokens = int(kwargs.get("keep_tokens", 0)) + if keep_tokens > 0: + run_cmd += f' --keep_tokens="{keep_tokens}"' + + caption_dropout_every_n_epochs = int(kwargs.get("caption_dropout_every_n_epochs", 0)) + if caption_dropout_every_n_epochs > 0: + run_cmd += f' --caption_dropout_every_n_epochs="{caption_dropout_every_n_epochs}"' + + caption_dropout_rate = float(kwargs.get("caption_dropout_rate", 0)) + if caption_dropout_rate > 0: + run_cmd += f' --caption_dropout_rate="{caption_dropout_rate}"' + + vae_batch_size = int(kwargs.get("vae_batch_size", 0)) + if vae_batch_size > 0: + run_cmd += f' --vae_batch_size="{vae_batch_size}"' + + bucket_reso_steps = int(kwargs.get("bucket_reso_steps", 64)) + run_cmd += f' --bucket_reso_steps={bucket_reso_steps}' + + save_every_n_steps = int(kwargs.get("save_every_n_steps", 0)) + if save_every_n_steps > 0: + run_cmd += f' --save_every_n_steps="{save_every_n_steps}"' + + save_last_n_steps = int(kwargs.get("save_last_n_steps", 0)) + if save_last_n_steps > 0: + run_cmd += f' --save_last_n_steps="{save_last_n_steps}"' + + save_last_n_steps_state = int(kwargs.get("save_last_n_steps_state", 0)) + if save_last_n_steps_state > 0: + run_cmd += f' --save_last_n_steps_state="{save_last_n_steps_state}"' + + min_snr_gamma = int(kwargs.get("min_snr_gamma", 0)) + if min_snr_gamma >= 1: + run_cmd += f' --min_snr_gamma={min_snr_gamma}' + + save_state = kwargs.get('save_state') + if save_state: + run_cmd += ' --save_state' + + mem_eff_attn = kwargs.get('mem_eff_attn') + if mem_eff_attn: + run_cmd += ' --mem_eff_attn' + + color_aug = kwargs.get('color_aug') + if color_aug: + run_cmd += ' --color_aug' + + flip_aug = kwargs.get('flip_aug') + if flip_aug: + run_cmd += ' --flip_aug' + + shuffle_caption = kwargs.get('shuffle_caption') + if shuffle_caption: + run_cmd += ' --shuffle_caption' + + gradient_checkpointing = kwargs.get('gradient_checkpointing') + if gradient_checkpointing: + run_cmd += ' --gradient_checkpointing' + + full_fp16 = kwargs.get('full_fp16') + if full_fp16: + run_cmd += ' --full_fp16' + + xformers = kwargs.get('xformers') + if xformers: + run_cmd += ' --xformers' + + persistent_data_loader_workers = kwargs.get('persistent_data_loader_workers') + if persistent_data_loader_workers: + run_cmd += ' --persistent_data_loader_workers' + + bucket_no_upscale = kwargs.get('bucket_no_upscale') + if bucket_no_upscale: + run_cmd += ' --bucket_no_upscale' + + random_crop = kwargs.get('random_crop') + if random_crop: + run_cmd += ' --random_crop' + + noise_offset_type = kwargs.get('noise_offset_type', 'Original') + if noise_offset_type == 'Original': + noise_offset = float(kwargs.get("noise_offset", 0)) + if noise_offset > 0: + run_cmd += f' --noise_offset={noise_offset}' + + adaptive_noise_scale = float(kwargs.get("adaptive_noise_scale", 0)) + if 
adaptive_noise_scale != 0 and noise_offset > 0: + run_cmd += f' --adaptive_noise_scale={adaptive_noise_scale}' + else: + multires_noise_iterations = int(kwargs.get("multires_noise_iterations", 0)) + if multires_noise_iterations > 0: + run_cmd += f' --multires_noise_iterations="{multires_noise_iterations}"' + + multires_noise_discount = float(kwargs.get("multires_noise_discount", 0)) + if multires_noise_discount > 0: + run_cmd += f' --multires_noise_discount="{multires_noise_discount}"' + + additional_parameters = kwargs.get("additional_parameters", "") + if additional_parameters: + run_cmd += f' {additional_parameters}' + + use_wandb = kwargs.get('use_wandb') + if use_wandb: + run_cmd += ' --log_with wandb' + + wandb_api_key = kwargs.get("wandb_api_key", "") + if wandb_api_key: + run_cmd += f' --wandb_api_key="{wandb_api_key}"' + return run_cmd diff --git a/library/convert_model_gui.py b/library/convert_model_gui.py index aaa39b87d..70b32e00e 100644 --- a/library/convert_model_gui.py +++ b/library/convert_model_gui.py @@ -19,6 +19,7 @@ def convert_model( target_model_name_input, target_model_type, target_save_precision_type, + unet_use_linear_projection, ): # Check for caption_text_input if source_model_type == '': @@ -68,6 +69,14 @@ def convert_model( if target_model_type == 'diffuser_safetensors': run_cmd += ' --use_safetensors' + # Fix for stabilityAI diffusers format. When saving v2 models in Diffusers format in training scripts and conversion scripts, + # it was found that the U-Net configuration is different from those of Hugging Face's stabilityai models (this repository is + # "use_linear_projection": false, stabilityai is true). Please note that the weight shapes are different, so please be careful + # when using the weight files directly. + + if unet_use_linear_projection: + run_cmd += ' --unet_use_linear_projection' + run_cmd += f' "{source_model_input}"' if ( @@ -155,7 +164,7 @@ def convert_model( ### -def gradio_convert_model_tab(): +def gradio_convert_model_tab(headless=False): with gr.Tab('Convert model'): gr.Markdown( 'This utility can be used to convert from one stable diffusion model format to another.' 
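`run_cmd_advanced_training` gets the same flag-by-flag treatment, and it is where the new noise and logging options become CLI arguments: `--noise_offset`/`--adaptive_noise_scale` only when the type is Original, `--multires_noise_iterations`/`--multires_noise_discount` only when it is Multires, and `--log_with wandb` plus `--wandb_api_key` when WANDB logging is enabled. For example (a hedged sketch; the API key is a placeholder, and `--bucket_reso_steps` is always emitted because it no longer sits behind a condition):

```python
from library.common_gui import run_cmd_advanced_training

cmd = run_cmd_advanced_training(
    noise_offset_type='Multires',
    multires_noise_iterations=8,
    multires_noise_discount=0.3,
    xformers=True,
    use_wandb=True,
    wandb_api_key='xxxxxxxx',   # placeholder, not a real key
)
print(cmd)
#  --bucket_reso_steps=64 --xformers --multires_noise_iterations="8"
#  --multires_noise_discount="0.3" --log_with wandb --wandb_api_key="xxxxxxxx"
```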
@@ -167,7 +176,9 @@ def gradio_convert_model_tab(): interactive=True, ) button_source_model_dir = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_source_model_dir.click( get_folder_path, @@ -176,7 +187,9 @@ def gradio_convert_model_tab(): ) button_source_model_file = gr.Button( - document_symbol, elem_id='open_folder_small' + document_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_source_model_file.click( get_file_path, @@ -203,7 +216,9 @@ def gradio_convert_model_tab(): interactive=True, ) button_target_model_folder = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_target_model_folder.click( get_folder_path, @@ -230,6 +245,11 @@ def gradio_convert_model_tab(): choices=['unspecified', 'fp16', 'bf16', 'float'], value='unspecified', ) + unet_use_linear_projection = gr.Checkbox( + label='UNet linear projection', + value=False, + info="Enable for Hugging Face's stabilityai models", + ) convert_button = gr.Button('Convert model') @@ -242,6 +262,7 @@ def gradio_convert_model_tab(): target_model_name_input, target_model_type, target_save_precision_type, + unet_use_linear_projection, ], show_progress=False, ) diff --git a/library/custom_train_functions.py b/library/custom_train_functions.py index f3758b0cc..ed96bd31b 100644 --- a/library/custom_train_functions.py +++ b/library/custom_train_functions.py @@ -1,5 +1,6 @@ import torch import argparse +import random import re from typing import List, Optional, Union @@ -18,6 +19,9 @@ def apply_snr_weight(loss, timesteps, noise_scheduler, gamma): return loss +# TODO train_utilと分散しているのでどちらかに寄せる + + def add_custom_train_arguments(parser: argparse.ArgumentParser, support_weighted_captions: bool = True): parser.add_argument( "--min_snr_gamma", @@ -342,3 +346,91 @@ def get_weighted_text_embeddings( text_embeddings = text_embeddings * (previous_mean / current_mean).unsqueeze(-1).unsqueeze(-1) return text_embeddings + + +# https://wandb.ai/johnowhitaker/multires_noise/reports/Multi-Resolution-Noise-for-Diffusion-Model-Training--VmlldzozNjYyOTU2 +def pyramid_noise_like(noise, device, iterations=6, discount=0.4): + b, c, w, h = noise.shape # EDIT: w and h get over-written, rename for a different variant! 
+ u = torch.nn.Upsample(size=(w, h), mode="bilinear").to(device) + for i in range(iterations): + r = random.random() * 2 + 2 # Rather than always going 2x, + wn, hn = max(1, int(w / (r**i))), max(1, int(h / (r**i))) + noise += u(torch.randn(b, c, wn, hn).to(device)) * discount**i + if wn == 1 or hn == 1: + break # Lowest resolution is 1x1 + return noise / noise.std() # Scaled back to roughly unit variance + + +# https://www.crosslabs.org//blog/diffusion-with-offset-noise +def apply_noise_offset(latents, noise, noise_offset, adaptive_noise_scale): + if noise_offset is None: + return noise + if adaptive_noise_scale is not None: + # latent shape: (batch_size, channels, height, width) + # abs mean value for each channel + latent_mean = torch.abs(latents.mean(dim=(2, 3), keepdim=True)) + + # multiply adaptive noise scale to the mean value and add it to the noise offset + noise_offset = noise_offset + adaptive_noise_scale * latent_mean + noise_offset = torch.clamp(noise_offset, 0.0, None) # in case of adaptive noise scale is negative + + noise = noise + noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device) + return noise + + +""" +########################################## +# Perlin Noise +def rand_perlin_2d(device, shape, res, fade=lambda t: 6 * t**5 - 15 * t**4 + 10 * t**3): + delta = (res[0] / shape[0], res[1] / shape[1]) + d = (shape[0] // res[0], shape[1] // res[1]) + + grid = ( + torch.stack( + torch.meshgrid(torch.arange(0, res[0], delta[0], device=device), torch.arange(0, res[1], delta[1], device=device)), + dim=-1, + ) + % 1 + ) + angles = 2 * torch.pi * torch.rand(res[0] + 1, res[1] + 1, device=device) + gradients = torch.stack((torch.cos(angles), torch.sin(angles)), dim=-1) + + tile_grads = ( + lambda slice1, slice2: gradients[slice1[0] : slice1[1], slice2[0] : slice2[1]] + .repeat_interleave(d[0], 0) + .repeat_interleave(d[1], 1) + ) + dot = lambda grad, shift: ( + torch.stack((grid[: shape[0], : shape[1], 0] + shift[0], grid[: shape[0], : shape[1], 1] + shift[1]), dim=-1) + * grad[: shape[0], : shape[1]] + ).sum(dim=-1) + + n00 = dot(tile_grads([0, -1], [0, -1]), [0, 0]) + n10 = dot(tile_grads([1, None], [0, -1]), [-1, 0]) + n01 = dot(tile_grads([0, -1], [1, None]), [0, -1]) + n11 = dot(tile_grads([1, None], [1, None]), [-1, -1]) + t = fade(grid[: shape[0], : shape[1]]) + return 1.414 * torch.lerp(torch.lerp(n00, n10, t[..., 0]), torch.lerp(n01, n11, t[..., 0]), t[..., 1]) + + +def rand_perlin_2d_octaves(device, shape, res, octaves=1, persistence=0.5): + noise = torch.zeros(shape, device=device) + frequency = 1 + amplitude = 1 + for _ in range(octaves): + noise += amplitude * rand_perlin_2d(device, shape, (frequency * res[0], frequency * res[1])) + frequency *= 2 + amplitude *= persistence + return noise + + +def perlin_noise(noise, device, octaves): + _, c, w, h = noise.shape + perlin = lambda: rand_perlin_2d_octaves(device, (w, h), (4, 4), octaves) + noise_perlin = [] + for _ in range(c): + noise_perlin.append(perlin()) + noise_perlin = torch.stack(noise_perlin).unsqueeze(0) # (1, c, w, h) + noise += noise_perlin # broadcast for each batch + return noise / noise.std() # Scaled back to roughly unit variance +""" diff --git a/library/dataset_balancing_gui.py b/library/dataset_balancing_gui.py index f2418afad..de74e561e 100644 --- a/library/dataset_balancing_gui.py +++ b/library/dataset_balancing_gui.py @@ -44,9 +44,11 @@ def dataset_balancing(concept_repeats, folder, insecure): # Count the number of image files images = len(image_files) - + if 
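`apply_noise_offset` and `pyramid_noise_like` are the training-side counterparts of the new GUI options: the first shifts the sampled noise per (batch, channel), optionally scaled by the absolute latent mean (`adaptive_noise_scale`), and the second layers progressively lower-resolution noise on top, discounted per level and renormalised to roughly unit variance. A minimal usage sketch, assuming the repository root is on `PYTHONPATH` and using arbitrary latent shapes:

```python
import torch

from library.custom_train_functions import apply_noise_offset, pyramid_noise_like

latents = torch.randn(4, 4, 64, 64)   # stand-in for a batch of VAE latents
noise = torch.randn_like(latents)

# Offset noise (the "Original" noise offset type in the GUI)
noise = apply_noise_offset(latents, noise, noise_offset=0.1, adaptive_noise_scale=0.01)

# Multires / pyramid noise (the "Multires" type); recommended iterations are 6-10
noise = pyramid_noise_like(noise, latents.device, iterations=8, discount=0.3)
print(noise.shape, noise.std())  # torch.Size([4, 4, 64, 64]) ~1.0
```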
images == 0: - print(f'No images of type .jpg, .jpeg, .png, .gif, .webp were found in {os.listdir(os.path.join(folder, subdir))}') + print( + f'No images of type .jpg, .jpeg, .png, .gif, .webp were found in {os.listdir(os.path.join(folder, subdir))}' + ) # Check if the subdirectory name starts with a number inside braces, # indicating that the repeats value should be multiplied @@ -102,7 +104,7 @@ def warning(insecure): return False -def gradio_dataset_balancing_tab(): +def gradio_dataset_balancing_tab(headless=False): with gr.Tab('Dreambooth/LoRA Dataset balancing'): gr.Markdown( 'This utility will ensure that each concept folder in the dataset folder is used equally during the training process of the dreambooth machine learning model, regardless of the number of images in each folder. It will do this by renaming the concept folders to indicate the number of times they should be repeated during training.' @@ -118,7 +120,7 @@ def gradio_dataset_balancing_tab(): ) select_dataset_folder_button = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) select_dataset_folder_button.click( get_folder_path, diff --git a/library/dreambooth_folder_creation_gui.py b/library/dreambooth_folder_creation_gui.py index b5d5ff49a..01df33d5f 100644 --- a/library/dreambooth_folder_creation_gui.py +++ b/library/dreambooth_folder_creation_gui.py @@ -114,6 +114,7 @@ def gradio_dreambooth_folder_creation_tab( reg_data_dir_input=gr.Textbox(), output_dir_input=gr.Textbox(), logging_dir_input=gr.Textbox(), + headless=False, ): with gr.Tab('Dreambooth/LoRA Folder preparation'): gr.Markdown( @@ -137,7 +138,7 @@ def gradio_dreambooth_folder_creation_tab( interactive=True, ) button_util_training_images_dir_input = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) button_util_training_images_dir_input.click( get_folder_path, @@ -157,7 +158,7 @@ def gradio_dreambooth_folder_creation_tab( interactive=True, ) button_util_regularization_images_dir_input = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) button_util_regularization_images_dir_input.click( get_folder_path, @@ -177,7 +178,7 @@ def gradio_dreambooth_folder_creation_tab( interactive=True, ) button_util_training_dir_output = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) button_util_training_dir_output.click( get_folder_path, outputs=util_training_dir_output diff --git a/library/extract_lora_from_dylora_gui.py b/library/extract_lora_from_dylora_gui.py index c7ad8f6eb..4bd70fc2e 100644 --- a/library/extract_lora_from_dylora_gui.py +++ b/library/extract_lora_from_dylora_gui.py @@ -46,15 +46,16 @@ def extract_dylora( print('Done extracting DyLoRA...') + ### # Gradio UI ### -def gradio_extract_dylora_tab(): +def gradio_extract_dylora_tab(headless=False): with gr.Tab('Extract DyLoRA'): gr.Markdown( - 'This utility can extract a LoRA network from a finetuned model.' + 'This utility can extract a DyLoRA network from a finetuned model.' 
) lora_ext = gr.Textbox(value='*.safetensors *.pt', visible=False) lora_ext_name = gr.Textbox(value='LoRA model types', visible=False) @@ -66,7 +67,9 @@ def gradio_extract_dylora_tab(): interactive=True, ) button_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_model_file.click( get_file_path, @@ -81,7 +84,9 @@ def gradio_extract_dylora_tab(): interactive=True, ) button_save_to = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_save_to.click( get_saveasfilename_path, diff --git a/library/extract_lora_gui.py b/library/extract_lora_gui.py index 53292d354..81c72f1ad 100644 --- a/library/extract_lora_gui.py +++ b/library/extract_lora_gui.py @@ -71,7 +71,7 @@ def extract_lora( ### -def gradio_extract_lora_tab(): +def gradio_extract_lora_tab(headless=False): with gr.Tab('Extract LoRA'): gr.Markdown( 'This utility can extract a LoRA network from a finetuned model.' @@ -88,7 +88,9 @@ def gradio_extract_lora_tab(): interactive=True, ) button_model_tuned_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_model_tuned_file.click( get_file_path, @@ -103,7 +105,9 @@ def gradio_extract_lora_tab(): interactive=True, ) button_model_org_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_model_org_file.click( get_file_path, @@ -118,7 +122,9 @@ def gradio_extract_lora_tab(): interactive=True, ) button_save_to = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_save_to.click( get_saveasfilename_path, @@ -172,7 +178,7 @@ def gradio_extract_lora_tab(): dim, v2, conv_dim, - device + device, ], show_progress=False, ) diff --git a/library/extract_lycoris_locon_gui.py b/library/extract_lycoris_locon_gui.py index 13575bbcb..a0aaff8ac 100644 --- a/library/extract_lycoris_locon_gui.py +++ b/library/extract_lycoris_locon_gui.py @@ -58,14 +58,18 @@ def extract_lycoris_locon( run_cmd += f' --device {device}' run_cmd += f' --mode {mode}' run_cmd += f' --safetensors' - run_cmd += f' --linear_dim {linear_dim}' - run_cmd += f' --conv_dim {conv_dim}' - run_cmd += f' --linear_threshold {linear_threshold}' - run_cmd += f' --conv_threshold {conv_threshold}' - run_cmd += f' --linear_ratio {linear_ratio}' - run_cmd += f' --conv_ratio {conv_ratio}' - run_cmd += f' --linear_quantile {linear_quantile}' - run_cmd += f' --conv_quantile {conv_quantile}' + if mode == 'fixed': + run_cmd += f' --linear_dim {linear_dim}' + run_cmd += f' --conv_dim {conv_dim}' + if mode == 'threshold': + run_cmd += f' --linear_threshold {linear_threshold}' + run_cmd += f' --conv_threshold {conv_threshold}' + if mode == 'ratio': + run_cmd += f' --linear_ratio {linear_ratio}' + run_cmd += f' --conv_ratio {conv_ratio}' + if mode == 'quantile': + run_cmd += f' --linear_quantile {linear_quantile}' + run_cmd += f' --conv_quantile {conv_quantile}' if use_sparse_bias: run_cmd += f' --use_sparse_bias' run_cmd += f' --sparsity {sparsity}' @@ -115,7 +119,7 @@ def update_mode(mode): return tuple(updates) -def gradio_extract_lycoris_locon_tab(): +def gradio_extract_lycoris_locon_tab(headless=False): with gr.Tab('Extract LyCORIS LoCON'): gr.Markdown( 'This utility can extract a LyCORIS LoCon network from a 
finetuned model.' @@ -134,7 +138,9 @@ def gradio_extract_lycoris_locon_tab(): interactive=True, ) button_db_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_db_model_file.click( get_file_path, @@ -149,7 +155,9 @@ def gradio_extract_lycoris_locon_tab(): interactive=True, ) button_base_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_base_model_file.click( get_file_path, @@ -164,7 +172,9 @@ def gradio_extract_lycoris_locon_tab(): interactive=True, ) button_output_name = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_output_name.click( get_saveasfilename_path, @@ -210,34 +220,38 @@ def gradio_extract_lycoris_locon_tab(): minimum=0, maximum=1, label='Linear threshold', - value=0, + value=0.65, step=0.01, interactive=True, + info='The higher the value, the smaller the file. Recommended starting value: 0.65', ) conv_threshold = gr.Slider( minimum=0, maximum=1, label='Conv threshold', - value=0, + value=0.65, step=0.01, interactive=True, + info='The higher the value, the smaller the file. Recommended starting value: 0.65', ) with gr.Row(visible=False) as ratio: linear_ratio = gr.Slider( minimum=0, maximum=1, label='Linear ratio', - value=0, + value=0.75, step=0.01, interactive=True, + info='The higher the value, the smaller the file. Recommended starting value: 0.75', ) conv_ratio = gr.Slider( minimum=0, maximum=1, label='Conv ratio', - value=0, + value=0.75, step=0.01, interactive=True, + info='The higher the value, the smaller the file. Recommended starting value: 0.75', ) with gr.Row(visible=False) as quantile: linear_quantile = gr.Slider( @@ -247,6 +261,7 @@ def gradio_extract_lycoris_locon_tab(): value=0.75, step=0.01, interactive=True, + info='The higher the value, the larger the file. Recommended starting value: 0.75', ) conv_quantile = gr.Slider( minimum=0, @@ -255,6 +270,7 @@ def gradio_extract_lycoris_locon_tab(): value=0.75, step=0.01, interactive=True, + info='The higher the value, the larger the file. Recommended starting value: 0.75', ) with gr.Row(): use_sparse_bias = gr.Checkbox( diff --git a/library/git_caption_gui.py b/library/git_caption_gui.py index 03e06b7f0..1159bff0d 100644 --- a/library/git_caption_gui.py +++ b/library/git_caption_gui.py @@ -27,9 +27,7 @@ def caption_images( return print(f'GIT captioning files in {train_data_dir}...') - run_cmd = ( - f'{PYTHON} finetune/make_captions_by_git.py' - ) + run_cmd = f'{PYTHON} finetune/make_captions_by_git.py' if not model_id == '': run_cmd += f' --model_id="{model_id}"' run_cmd += f' --batch_size="{int(batch_size)}"' @@ -65,7 +63,7 @@ def caption_images( ### -def gradio_git_caption_gui_tab(): +def gradio_git_caption_gui_tab(headless=False): with gr.Tab('GIT Captioning'): gr.Markdown( 'This utility will use GIT to caption files for each images in a folder.' 
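Back in the Extract LyCORIS LoCON tab, the extraction command now passes only the parameters that belong to the selected mode (fixed → dims, threshold → thresholds, ratio → ratios, quantile → quantiles) instead of always passing all eight, and the sliders ship with the recommended starting values from their `info` text. A toy sketch of that mode-to-flags mapping (illustrative only, not code from the patch):

```python
def mode_flags(mode: str, **p) -> str:
    """Return only the CLI flags that the chosen extraction mode uses."""
    if mode == 'fixed':
        return f' --linear_dim {p["linear_dim"]} --conv_dim {p["conv_dim"]}'
    if mode == 'threshold':
        return f' --linear_threshold {p["linear_threshold"]} --conv_threshold {p["conv_threshold"]}'
    if mode == 'ratio':
        return f' --linear_ratio {p["linear_ratio"]} --conv_ratio {p["conv_ratio"]}'
    if mode == 'quantile':
        return f' --linear_quantile {p["linear_quantile"]} --conv_quantile {p["conv_quantile"]}'
    return ''


print(mode_flags('threshold', linear_threshold=0.65, conv_threshold=0.65))
#  --linear_threshold 0.65 --conv_threshold 0.65
```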
@@ -77,7 +75,7 @@ def gradio_git_caption_gui_tab(): interactive=True, ) button_train_data_dir_input = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) button_train_data_dir_input.click( get_folder_path, diff --git a/library/group_images_gui.py b/library/group_images_gui.py new file mode 100644 index 000000000..a100e32ae --- /dev/null +++ b/library/group_images_gui.py @@ -0,0 +1,110 @@ +import gradio as gr +from easygui import msgbox +import subprocess +from .common_gui import get_folder_path +import os + +PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe' + +def group_images( + input_folder, + output_folder, + group_size, + include_subfolders, + do_not_copy_other_files +): + if input_folder == '': + msgbox('Input folder is missing...') + return + + if output_folder == '': + msgbox('Please provide an output folder.') + return + + print(f'Grouping images in {input_folder}...') + + run_cmd = f'{PYTHON} "{os.path.join("tools","group_images.py")}"' + run_cmd += f' "{input_folder}"' + run_cmd += f' "{output_folder}"' + run_cmd += f' {(group_size)}' + if include_subfolders: + run_cmd += f' --include_subfolders' + if do_not_copy_other_files: + run_cmd += f' --do_not_copy_other_files' + + print(run_cmd) + + if os.name == 'posix': + os.system(run_cmd) + else: + subprocess.run(run_cmd) + + print('...grouping done') + + +def gradio_group_images_gui_tab(headless=False): + with gr.Tab('Group Images'): + gr.Markdown('This utility will group images in a folder based on their aspect ratio.') + + with gr.Row(): + input_folder = gr.Textbox( + label='Input folder', + placeholder='Directory containing the images to group', + interactive=True, + ) + button_input_folder = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) + button_input_folder.click( + get_folder_path, + outputs=input_folder, + show_progress=False, + ) + + output_folder = gr.Textbox( + label='Output folder', + placeholder='Directory where the grouped images will be stored', + interactive=True, + ) + button_output_folder = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) + button_output_folder.click( + get_folder_path, + outputs=output_folder, + show_progress=False, + ) + with gr.Row(): + group_size = gr.Slider( + label='Group size', + info='Number of images to group together', + value='4', + minimum=1, maximum=64, step=1, + interactive=True, + ) + + include_subfolders = gr.Checkbox( + label='Include Subfolders', + value=False, + info='Include images in subfolders as well', + ) + + do_not_copy_other_files = gr.Checkbox( + label='Do not copy other files', + value=False, + info='Do not copy other files in the input folder to the output folder', + ) + + group_images_button = gr.Button('Group images') + + group_images_button.click( + group_images, + inputs=[ + input_folder, + output_folder, + group_size, + include_subfolders, + do_not_copy_other_files + ], + show_progress=False, + ) diff --git a/library/huggingface_util.py b/library/huggingface_util.py index 41031b1ff..1dc496ff5 100644 --- a/library/huggingface_util.py +++ b/library/huggingface_util.py @@ -1,15 +1,12 @@ -from typing import * +from typing import Union, BinaryIO from huggingface_hub import HfApi from pathlib import Path import argparse import os - from library.utils import fire_in_thread -def exists_repo( - repo_id: str, repo_type: str, revision: str = "main", token: str = None -): +def exists_repo(repo_id: str, repo_type: str, revision: str = "main", token: 
str = None): api = HfApi( token=token, ) @@ -33,27 +30,35 @@ def upload( private = args.huggingface_repo_visibility is None or args.huggingface_repo_visibility != "public" api = HfApi(token=token) if not exists_repo(repo_id=repo_id, repo_type=repo_type, token=token): - api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private) + try: + api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private) + except Exception as e: # とりあえずRepositoryNotFoundErrorは確認したが他にあると困るので + print("===========================================") + print(f"failed to create HuggingFace repo / HuggingFaceのリポジトリの作成に失敗しました : {e}") + print("===========================================") - is_folder = (type(src) == str and os.path.isdir(src)) or ( - isinstance(src, Path) and src.is_dir() - ) + is_folder = (type(src) == str and os.path.isdir(src)) or (isinstance(src, Path) and src.is_dir()) def uploader(): - if is_folder: - api.upload_folder( - repo_id=repo_id, - repo_type=repo_type, - folder_path=src, - path_in_repo=path_in_repo, - ) - else: - api.upload_file( - repo_id=repo_id, - repo_type=repo_type, - path_or_fileobj=src, - path_in_repo=path_in_repo, - ) + try: + if is_folder: + api.upload_folder( + repo_id=repo_id, + repo_type=repo_type, + folder_path=src, + path_in_repo=path_in_repo, + ) + else: + api.upload_file( + repo_id=repo_id, + repo_type=repo_type, + path_or_fileobj=src, + path_in_repo=path_in_repo, + ) + except Exception as e: # RuntimeErrorを確認済みだが他にあると困るので + print("===========================================") + print(f"failed to upload to HuggingFace / HuggingFaceへのアップロードに失敗しました : {e}") + print("===========================================") if args.async_upload and not force_sync_upload: fire_in_thread(uploader) @@ -72,7 +77,5 @@ def list_dir( token=token, ) repo_info = api.repo_info(repo_id=repo_id, revision=revision, repo_type=repo_type) - file_list = [ - file for file in repo_info.siblings if file.rfilename.startswith(subfolder) - ] + file_list = [file for file in repo_info.siblings if file.rfilename.startswith(subfolder)] return file_list diff --git a/library/merge_lora_gui.py b/library/merge_lora_gui.py index f4456bc71..3031aee25 100644 --- a/library/merge_lora_gui.py +++ b/library/merge_lora_gui.py @@ -14,6 +14,7 @@ document_symbol = '\U0001F4C4' # 📄 PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe' + def check_model(model): if not model: return True @@ -22,6 +23,7 @@ def check_model(model): return False return True + def verify_conditions(sd_model, lora_models): lora_models_count = sum(1 for model in lora_models if model) if sd_model and lora_models_count >= 1: @@ -51,7 +53,9 @@ def merge_lora( ratios = [ratio_a, ratio_b, ratio_c, ratio_d] if not verify_conditions(sd_model, lora_models): - print("Warning: Either provide at least one LoRa model along with the sd_model or at least two LoRa models if no sd_model is provided.") + print( + 'Warning: Either provide at least one LoRa model along with the sd_model or at least two LoRa models if no sd_model is provided.' + ) return for model in models: @@ -86,14 +90,17 @@ def merge_lora( print('Done merging...') + ### # Gradio UI ### -def gradio_merge_lora_tab(): +def gradio_merge_lora_tab(headless=False): with gr.Tab('Merge LoRA'): - gr.Markdown('This utility can merge up to 4 LoRA together or alternativelly merge up to 4 LoRA into a SD checkpoint.') + gr.Markdown( + 'This utility can merge up to 4 LoRA together or alternativelly merge up to 4 LoRA into a SD checkpoint.' 
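The `huggingface_util` changes wrap both `create_repo` and the upload calls in `try`/`except` so that a failed Hugging Face operation is logged rather than killing a long training run, while `fire_in_thread` keeps async uploads off the main thread. A condensed sketch of that fire-and-forget pattern, using a plain daemon thread; `upload_outputs_async` is an illustrative helper, not part of the patch.

```python
import threading

from huggingface_hub import HfApi


def upload_outputs_async(repo_id: str, folder: str, token: str) -> None:
    """Upload a results folder in the background; log failures instead of raising."""
    api = HfApi(token=token)

    def _worker():
        try:
            api.upload_folder(repo_id=repo_id, repo_type='model', folder_path=folder)
        except Exception as e:
            print('===========================================')
            print(f'failed to upload to HuggingFace: {e}')
            print('===========================================')

    threading.Thread(target=_worker, daemon=True).start()


# upload_outputs_async('your-name/your-model', './outputs', token='hf_xxx')  # placeholders
```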
+ ) lora_ext = gr.Textbox(value='*.safetensors *.pt', visible=False) lora_ext_name = gr.Textbox(value='LoRA model types', visible=False) @@ -105,10 +112,12 @@ def gradio_merge_lora_tab(): label='SD Model', placeholder='(Optional) Stable Diffusion model', interactive=True, - info='Provide a SD file path IF you want to merge it with LoRA files' + info='Provide a SD file path IF you want to merge it with LoRA files', ) sd_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) sd_model_file.click( get_file_path, @@ -116,7 +125,7 @@ def gradio_merge_lora_tab(): outputs=sd_model, show_progress=False, ) - + with gr.Row(): lora_a_model = gr.Textbox( label='LoRA model "A"', @@ -124,7 +133,9 @@ def gradio_merge_lora_tab(): interactive=True, ) button_lora_a_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_lora_a_model_file.click( get_file_path, @@ -139,7 +150,9 @@ def gradio_merge_lora_tab(): interactive=True, ) button_lora_b_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_lora_b_model_file.click( get_file_path, @@ -147,7 +160,7 @@ def gradio_merge_lora_tab(): outputs=lora_b_model, show_progress=False, ) - + with gr.Row(): ratio_a = gr.Slider( label='Model A merge ratio (eg: 0.5 mean 50%)', @@ -157,7 +170,7 @@ def gradio_merge_lora_tab(): value=0.0, interactive=True, ) - + ratio_b = gr.Slider( label='Model B merge ratio (eg: 0.5 mean 50%)', minimum=0, @@ -166,7 +179,7 @@ def gradio_merge_lora_tab(): value=0.0, interactive=True, ) - + with gr.Row(): lora_c_model = gr.Textbox( label='LoRA model "C"', @@ -174,7 +187,9 @@ def gradio_merge_lora_tab(): interactive=True, ) button_lora_c_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_lora_c_model_file.click( get_file_path, @@ -182,14 +197,16 @@ def gradio_merge_lora_tab(): outputs=lora_c_model, show_progress=False, ) - + lora_d_model = gr.Textbox( label='LoRA model "D"', placeholder='Path to the LoRA D model', interactive=True, ) button_lora_d_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_lora_d_model_file.click( get_file_path, @@ -197,7 +214,7 @@ def gradio_merge_lora_tab(): outputs=lora_d_model, show_progress=False, ) - + with gr.Row(): ratio_c = gr.Slider( label='Model C merge ratio (eg: 0.5 mean 50%)', @@ -207,7 +224,7 @@ def gradio_merge_lora_tab(): value=0.0, interactive=True, ) - + ratio_d = gr.Slider( label='Model D merge ratio (eg: 0.5 mean 50%)', minimum=0, @@ -224,7 +241,9 @@ def gradio_merge_lora_tab(): interactive=True, ) button_save_to = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_save_to.click( get_saveasfilename_path, diff --git a/library/merge_lycoris_gui.py b/library/merge_lycoris_gui.py index ed07528d7..0fd8e1522 100644 --- a/library/merge_lycoris_gui.py +++ b/library/merge_lycoris_gui.py @@ -13,6 +13,7 @@ document_symbol = '\U0001F4C4' # 📄 PYTHON = 'python3' if os.name == 'posix' else './venv/Scripts/python.exe' + def merge_lycoris( base_model, lycoris_model, @@ -44,14 +45,17 @@ def merge_lycoris( print('Done merging...') + ### # Gradio UI ### 
-def gradio_merge_lycoris_tab(): +def gradio_merge_lycoris_tab(headless=False): with gr.Tab('Merge LyCORIS'): - gr.Markdown('This utility can merge a LyCORIS model into a SD checkpoint.') + gr.Markdown( + 'This utility can merge a LyCORIS model into a SD checkpoint.' + ) lora_ext = gr.Textbox(value='*.safetensors *.pt', visible=False) lora_ext_name = gr.Textbox(value='LoRA model types', visible=False) @@ -63,10 +67,12 @@ def gradio_merge_lycoris_tab(): label='SD Model', placeholder='(Optional) Stable Diffusion base model', interactive=True, - info='Provide a SD file path that you want to merge with the LyCORIS file' + info='Provide a SD file path that you want to merge with the LyCORIS file', ) base_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) base_model_file.click( get_file_path, @@ -74,7 +80,7 @@ def gradio_merge_lycoris_tab(): outputs=base_model, show_progress=False, ) - + with gr.Row(): lycoris_model = gr.Textbox( label='LyCORIS model', @@ -82,7 +88,9 @@ def gradio_merge_lycoris_tab(): interactive=True, ) button_lycoris_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_lycoris_model_file.click( get_file_path, @@ -90,7 +98,7 @@ def gradio_merge_lycoris_tab(): outputs=lycoris_model, show_progress=False, ) - + with gr.Row(): weight = gr.Slider( label='Model A merge ratio (eg: 0.5 mean 50%)', @@ -108,7 +116,9 @@ def gradio_merge_lycoris_tab(): interactive=True, ) button_output_name = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_output_name.click( get_saveasfilename_path, @@ -118,21 +128,28 @@ def gradio_merge_lycoris_tab(): ) dtype = gr.Dropdown( label='Save dtype', - choices=['float', 'float16', 'float32', 'float64', 'bfloat', 'bfloat16'], + choices=[ + 'float', + 'float16', + 'float32', + 'float64', + 'bfloat', + 'bfloat16', + ], value='float16', interactive=True, ) - + device = gr.Dropdown( label='Device', choices=[ 'cpu', - # 'cuda', + # 'cuda', ], value='cpu', interactive=True, ) - + is_v2 = gr.Checkbox(label='is v2', value=False, interactive=True) merge_button = gr.Button('Merge model') diff --git a/library/model_util.py b/library/model_util.py index 35b0b6afe..26f72235d 100644 --- a/library/model_util.py +++ b/library/model_util.py @@ -22,6 +22,7 @@ UNET_PARAMS_NUM_RES_BLOCKS = 2 UNET_PARAMS_CONTEXT_DIM = 768 UNET_PARAMS_NUM_HEADS = 8 +# UNET_PARAMS_USE_LINEAR_PROJECTION = False VAE_PARAMS_Z_CHANNELS = 4 VAE_PARAMS_RESOLUTION = 256 @@ -34,6 +35,7 @@ # V2 V2_UNET_PARAMS_ATTENTION_HEAD_DIM = [5, 10, 20, 20] V2_UNET_PARAMS_CONTEXT_DIM = 1024 +# V2_UNET_PARAMS_USE_LINEAR_PROJECTION = True # Diffusersの設定を読み込むための参照モデル DIFFUSERS_REF_MODEL_ID_V1 = "runwayml/stable-diffusion-v1-5" @@ -357,8 +359,9 @@ def convert_ldm_unet_checkpoint(v2, checkpoint, config): new_checkpoint[new_path] = unet_state_dict[old_path] - # SDのv2では1*1のconv2dがlinearに変わっているので、linear->convに変換する - if v2: + # SDのv2では1*1のconv2dがlinearに変わっている + # 誤って Diffusers 側を conv2d のままにしてしまったので、変換必要 + if v2 and not config.get('use_linear_projection', False): linear_transformer_to_conv(new_checkpoint) return new_checkpoint @@ -468,7 +471,7 @@ def convert_ldm_vae_checkpoint(checkpoint, config): return new_checkpoint -def create_unet_diffusers_config(v2): +def create_unet_diffusers_config(v2, use_linear_projection_in_v2=False): """ Creates a config 
for the diffusers based on the config of the LDM model. """ @@ -500,7 +503,10 @@ def create_unet_diffusers_config(v2): layers_per_block=UNET_PARAMS_NUM_RES_BLOCKS, cross_attention_dim=UNET_PARAMS_CONTEXT_DIM if not v2 else V2_UNET_PARAMS_CONTEXT_DIM, attention_head_dim=UNET_PARAMS_NUM_HEADS if not v2 else V2_UNET_PARAMS_ATTENTION_HEAD_DIM, + # use_linear_projection=UNET_PARAMS_USE_LINEAR_PROJECTION if not v2 else V2_UNET_PARAMS_USE_LINEAR_PROJECTION, ) + if v2 and use_linear_projection_in_v2: + config["use_linear_projection"] = True return config @@ -846,11 +852,11 @@ def load_checkpoint_with_text_encoder_conversion(ckpt_path, device="cpu"): # TODO dtype指定の動作が怪しいので確認する text_encoderを指定形式で作れるか未確認 -def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, device="cpu", dtype=None): +def load_models_from_stable_diffusion_checkpoint(v2, ckpt_path, device="cpu", dtype=None, unet_use_linear_projection_in_v2=False): _, state_dict = load_checkpoint_with_text_encoder_conversion(ckpt_path, device) # Convert the UNet2DConditionModel model. - unet_config = create_unet_diffusers_config(v2) + unet_config = create_unet_diffusers_config(v2, unet_use_linear_projection_in_v2) converted_unet_checkpoint = convert_ldm_unet_checkpoint(v2, state_dict, unet_config) unet = UNet2DConditionModel(**unet_config).to(device) diff --git a/library/resize_lora_gui.py b/library/resize_lora_gui.py index 0e756080c..9f11f1e09 100644 --- a/library/resize_lora_gui.py +++ b/library/resize_lora_gui.py @@ -73,13 +73,14 @@ def resize_lora( subprocess.run(run_cmd) print('Done resizing...') - + + ### # Gradio UI ### -def gradio_resize_lora_tab(): +def gradio_resize_lora_tab(headless=False): with gr.Tab('Resize LoRA'): gr.Markdown('This utility can resize a LoRA.') @@ -93,7 +94,9 @@ def gradio_resize_lora_tab(): interactive=True, ) button_lora_a_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_lora_a_model_file.click( get_file_path, @@ -132,7 +135,9 @@ def gradio_resize_lora_tab(): interactive=True, ) button_save_to = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_save_to.click( get_saveasfilename_path, diff --git a/library/slicing_vae.py b/library/slicing_vae.py new file mode 100644 index 000000000..490b5a75d --- /dev/null +++ b/library/slicing_vae.py @@ -0,0 +1,679 @@ +# Modified from Diffusers to reduce VRAM usage + +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
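The `model_util` change above addresses the mismatch described in the conversion-tab comment: v2 checkpoints saved to Diffusers by these scripts used `use_linear_projection: false`, whereas stabilityai's published v2 Diffusers models use `true`, so the weight shapes differ. The new argument lets callers opt into the stabilityai-compatible layout. A quick check of the generated config (assuming the repository root is importable):

```python
from library.model_util import create_unet_diffusers_config

v1_config = create_unet_diffusers_config(v2=False)
v2_config = create_unet_diffusers_config(v2=True, use_linear_projection_in_v2=True)

print('use_linear_projection' in v1_config)   # False: v1 keeps the Diffusers default
print(v2_config['use_linear_projection'])     # True: matches stabilityai's v2 repos
```

The same switch is what the new `--unet_use_linear_projection` checkbox in the Convert model tab and the `unet_use_linear_projection_in_v2` argument of `load_models_from_stable_diffusion_checkpoint` expose.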
+from dataclasses import dataclass +from typing import Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn as nn + + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.modeling_utils import ModelMixin +from diffusers.utils import BaseOutput +from diffusers.models.unet_2d_blocks import UNetMidBlock2D, get_down_block, get_up_block, ResnetBlock2D +from diffusers.models.vae import DecoderOutput, Encoder, AutoencoderKLOutput, DiagonalGaussianDistribution + + +def slice_h(x, num_slices): + # slice with pad 1 both sides: to eliminate side effect of padding of conv2d + # Conv2dのpaddingの副作用を排除するために、両側にpad 1しながらHをスライスする + # NCHWでもNHWCでもどちらでも動く + size = (x.shape[2] + num_slices - 1) // num_slices + sliced = [] + for i in range(num_slices): + if i == 0: + sliced.append(x[:, :, : size + 1, :]) + else: + end = size * (i + 1) + 1 + if x.shape[2] - end < 3: # if the last slice is too small, use the rest of the tensor 最後が細すぎるとconv2dできないので全部使う + end = x.shape[2] + sliced.append(x[:, :, size * i - 1 : end, :]) + if end >= x.shape[2]: + break + return sliced + + +def cat_h(sliced): + # padding分を除いて結合する + cat = [] + for i, x in enumerate(sliced): + if i == 0: + cat.append(x[:, :, :-1, :]) + elif i == len(sliced) - 1: + cat.append(x[:, :, 1:, :]) + else: + cat.append(x[:, :, 1:-1, :]) + del x + x = torch.cat(cat, dim=2) + return x + + +def resblock_forward(_self, num_slices, input_tensor, temb): + assert _self.upsample is None and _self.downsample is None + assert _self.norm1.num_groups == _self.norm2.num_groups + assert temb is None + + # make sure norms are on cpu + org_device = input_tensor.device + cpu_device = torch.device("cpu") + _self.norm1.to(cpu_device) + _self.norm2.to(cpu_device) + + # GroupNormがCPUでfp16で動かない対策 + org_dtype = input_tensor.dtype + if org_dtype == torch.float16: + _self.norm1.to(torch.float32) + _self.norm2.to(torch.float32) + + # すべてのテンソルをCPUに移動する + input_tensor = input_tensor.to(cpu_device) + hidden_states = input_tensor + + # どうもこれは結果が異なるようだ…… + # def sliced_norm1(norm, x): + # num_div = 4 if up_block_idx <= 2 else x.shape[1] // norm.num_groups + # sliced_tensor = torch.chunk(x, num_div, dim=1) + # sliced_weight = torch.chunk(norm.weight, num_div, dim=0) + # sliced_bias = torch.chunk(norm.bias, num_div, dim=0) + # print(sliced_tensor[0].shape, num_div, sliced_weight[0].shape, sliced_bias[0].shape) + # normed_tensor = [] + # for i in range(num_div): + # n = torch.group_norm(sliced_tensor[i], norm.num_groups, sliced_weight[i], sliced_bias[i], norm.eps) + # normed_tensor.append(n) + # del n + # x = torch.cat(normed_tensor, dim=1) + # return num_div, x + + # normを分割すると結果が変わるので、ここだけは分割しない。GPUで計算するとVRAMが足りなくなるので、CPUで計算する。幸いCPUでもそこまで遅くない + if org_dtype == torch.float16: + hidden_states = hidden_states.to(torch.float32) + hidden_states = _self.norm1(hidden_states) # run on cpu + if org_dtype == torch.float16: + hidden_states = hidden_states.to(torch.float16) + + sliced = slice_h(hidden_states, num_slices) + del hidden_states + + for i in range(len(sliced)): + x = sliced[i] + sliced[i] = None + + # 計算する部分だけGPUに移動する、以下同様 + x = x.to(org_device) + x = _self.nonlinearity(x) + x = _self.conv1(x) + x = x.to(cpu_device) + sliced[i] = x + del x + + hidden_states = cat_h(sliced) + del sliced + + if org_dtype == torch.float16: + hidden_states = hidden_states.to(torch.float32) + hidden_states = _self.norm2(hidden_states) # run on cpu + if org_dtype == torch.float16: + hidden_states = hidden_states.to(torch.float16) + + sliced = 
slice_h(hidden_states, num_slices) + del hidden_states + + for i in range(len(sliced)): + x = sliced[i] + sliced[i] = None + + x = x.to(org_device) + x = _self.nonlinearity(x) + x = _self.dropout(x) + x = _self.conv2(x) + x = x.to(cpu_device) + sliced[i] = x + del x + + hidden_states = cat_h(sliced) + del sliced + + # make shortcut + if _self.conv_shortcut is not None: + sliced = list(torch.chunk(input_tensor, num_slices, dim=2)) # no padding in conv_shortcut パディングがないので普通にスライスする + del input_tensor + + for i in range(len(sliced)): + x = sliced[i] + sliced[i] = None + + x = x.to(org_device) + x = _self.conv_shortcut(x) + x = x.to(cpu_device) + sliced[i] = x + del x + + input_tensor = torch.cat(sliced, dim=2) + del sliced + + output_tensor = (input_tensor + hidden_states) / _self.output_scale_factor + + output_tensor = output_tensor.to(org_device) # 次のレイヤーがGPUで計算する + return output_tensor + + +class SlicingEncoder(nn.Module): + def __init__( + self, + in_channels=3, + out_channels=3, + down_block_types=("DownEncoderBlock2D",), + block_out_channels=(64,), + layers_per_block=2, + norm_num_groups=32, + act_fn="silu", + double_z=True, + num_slices=2, + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = torch.nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, stride=1, padding=1) + + self.mid_block = None + self.down_blocks = nn.ModuleList([]) + + # down + output_channel = block_out_channels[0] + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=self.layers_per_block, + in_channels=input_channel, + out_channels=output_channel, + add_downsample=not is_final_block, + resnet_eps=1e-6, + downsample_padding=0, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attn_num_head_channels=None, + temb_channels=None, + ) + self.down_blocks.append(down_block) + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default", + attn_num_head_channels=None, + resnet_groups=norm_num_groups, + temb_channels=None, + ) + self.mid_block.attentions[0].set_use_memory_efficient_attention_xformers(True) # とりあえずDiffusersのxformersを使う + + # out + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[-1], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + + conv_out_channels = 2 * out_channels if double_z else out_channels + self.conv_out = nn.Conv2d(block_out_channels[-1], conv_out_channels, 3, padding=1) + + # replace forward of ResBlocks + def wrapper(func, module, num_slices): + def forward(*args, **kwargs): + return func(module, num_slices, *args, **kwargs) + + return forward + + self.num_slices = num_slices + div = num_slices / (2 ** (len(self.down_blocks) - 1)) # 深い層はそこまで分割しなくていいので適宜減らす + # print(f"initial divisor: {div}") + if div >= 2: + div = int(div) + for resnet in self.mid_block.resnets: + resnet.forward = wrapper(resblock_forward, resnet, div) + # midblock doesn't have downsample + + for i, down_block in enumerate(self.down_blocks[::-1]): + if div >= 2: + div = int(div) + # print(f"down block: {i} divisor: {div}") + for resnet in down_block.resnets: + resnet.forward = wrapper(resblock_forward, resnet, div) + if down_block.downsamplers is not None: + # print("has downsample") + for downsample in down_block.downsamplers: + 
downsample.forward = wrapper(self.downsample_forward, downsample, div * 2) + div *= 2 + + def forward(self, x): + sample = x + del x + + org_device = sample.device + cpu_device = torch.device("cpu") + + # sample = self.conv_in(sample) + sample = sample.to(cpu_device) + sliced = slice_h(sample, self.num_slices) + del sample + + for i in range(len(sliced)): + x = sliced[i] + sliced[i] = None + + x = x.to(org_device) + x = self.conv_in(x) + x = x.to(cpu_device) + sliced[i] = x + del x + + sample = cat_h(sliced) + del sliced + + sample = sample.to(org_device) + + # down + for down_block in self.down_blocks: + sample = down_block(sample) + + # middle + sample = self.mid_block(sample) + + # post-process + # ここも省メモリ化したいが、恐らくそこまでメモリを食わないので省略 + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + return sample + + def downsample_forward(self, _self, num_slices, hidden_states): + assert hidden_states.shape[1] == _self.channels + assert _self.use_conv and _self.padding == 0 + print("downsample forward", num_slices, hidden_states.shape) + + org_device = hidden_states.device + cpu_device = torch.device("cpu") + + hidden_states = hidden_states.to(cpu_device) + pad = (0, 1, 0, 1) + hidden_states = torch.nn.functional.pad(hidden_states, pad, mode="constant", value=0) + + # slice with even number because of stride 2 + # strideが2なので偶数でスライスする + # slice with pad 1 both sides: to eliminate side effect of padding of conv2d + size = (hidden_states.shape[2] + num_slices - 1) // num_slices + size = size + 1 if size % 2 == 1 else size + + sliced = [] + for i in range(num_slices): + if i == 0: + sliced.append(hidden_states[:, :, : size + 1, :]) + else: + end = size * (i + 1) + 1 + if hidden_states.shape[2] - end < 4: # if the last slice is too small, use the rest of the tensor + end = hidden_states.shape[2] + sliced.append(hidden_states[:, :, size * i - 1 : end, :]) + if end >= hidden_states.shape[2]: + break + del hidden_states + + for i in range(len(sliced)): + x = sliced[i] + sliced[i] = None + + x = x.to(org_device) + x = _self.conv(x) + x = x.to(cpu_device) + + # ここだけ雰囲気が違うのはCopilotのせい + if i == 0: + hidden_states = x + else: + hidden_states = torch.cat([hidden_states, x], dim=2) + + hidden_states = hidden_states.to(org_device) + # print("downsample forward done", hidden_states.shape) + return hidden_states + + +class SlicingDecoder(nn.Module): + def __init__( + self, + in_channels=3, + out_channels=3, + up_block_types=("UpDecoderBlock2D",), + block_out_channels=(64,), + layers_per_block=2, + norm_num_groups=32, + act_fn="silu", + num_slices=2, + ): + super().__init__() + self.layers_per_block = layers_per_block + + self.conv_in = nn.Conv2d(in_channels, block_out_channels[-1], kernel_size=3, stride=1, padding=1) + + self.mid_block = None + self.up_blocks = nn.ModuleList([]) + + # mid + self.mid_block = UNetMidBlock2D( + in_channels=block_out_channels[-1], + resnet_eps=1e-6, + resnet_act_fn=act_fn, + output_scale_factor=1, + resnet_time_scale_shift="default", + attn_num_head_channels=None, + resnet_groups=norm_num_groups, + temb_channels=None, + ) + self.mid_block.attentions[0].set_use_memory_efficient_attention_xformers(True) # とりあえずDiffusersのxformersを使う + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] + for i, up_block_type in enumerate(up_block_types): + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + + is_final_block = i == 
len(block_out_channels) - 1 + + up_block = get_up_block( + up_block_type, + num_layers=self.layers_per_block + 1, + in_channels=prev_output_channel, + out_channels=output_channel, + prev_output_channel=None, + add_upsample=not is_final_block, + resnet_eps=1e-6, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + attn_num_head_channels=None, + temb_channels=None, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + # out + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=1e-6) + self.conv_act = nn.SiLU() + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) + + # replace forward of ResBlocks + def wrapper(func, module, num_slices): + def forward(*args, **kwargs): + return func(module, num_slices, *args, **kwargs) + + return forward + + self.num_slices = num_slices + div = num_slices / (2 ** (len(self.up_blocks) - 1)) + print(f"initial divisor: {div}") + if div >= 2: + div = int(div) + for resnet in self.mid_block.resnets: + resnet.forward = wrapper(resblock_forward, resnet, div) + # midblock doesn't have upsample + + for i, up_block in enumerate(self.up_blocks): + if div >= 2: + div = int(div) + # print(f"up block: {i} divisor: {div}") + for resnet in up_block.resnets: + resnet.forward = wrapper(resblock_forward, resnet, div) + if up_block.upsamplers is not None: + # print("has upsample") + for upsample in up_block.upsamplers: + upsample.forward = wrapper(self.upsample_forward, upsample, div * 2) + div *= 2 + + def forward(self, z): + sample = z + del z + sample = self.conv_in(sample) + + # middle + sample = self.mid_block(sample) + + # up + for i, up_block in enumerate(self.up_blocks): + sample = up_block(sample) + + # post-process + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + + # conv_out with slicing because of VRAM usage + # conv_outはとてもVRAM使うのでスライスして対応 + org_device = sample.device + cpu_device = torch.device("cpu") + sample = sample.to(cpu_device) + + sliced = slice_h(sample, self.num_slices) + del sample + for i in range(len(sliced)): + x = sliced[i] + sliced[i] = None + + x = x.to(org_device) + x = self.conv_out(x) + x = x.to(cpu_device) + sliced[i] = x + sample = cat_h(sliced) + del sliced + + sample = sample.to(org_device) + return sample + + def upsample_forward(self, _self, num_slices, hidden_states, output_size=None): + assert hidden_states.shape[1] == _self.channels + assert _self.use_conv_transpose == False and _self.use_conv + + org_dtype = hidden_states.dtype + org_device = hidden_states.device + cpu_device = torch.device("cpu") + + hidden_states = hidden_states.to(cpu_device) + sliced = slice_h(hidden_states, num_slices) + del hidden_states + + for i in range(len(sliced)): + x = sliced[i] + sliced[i] = None + + x = x.to(org_device) + + # Cast to float32 to as 'upsample_nearest2d_out_frame' op does not support bfloat16 + # TODO(Suraj): Remove this cast once the issue is fixed in PyTorch + # https://github.com/pytorch/pytorch/issues/86679 + # PyTorch 2で直らないかね…… + if org_dtype == torch.bfloat16: + x = x.to(torch.float32) + + x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + + if org_dtype == torch.bfloat16: + x = x.to(org_dtype) + + x = _self.conv(x) + + # upsampleされてるのでpadは2になる + if i == 0: + x = x[:, :, :-2, :] + elif i == num_slices - 1: + x = x[:, :, 2:, :] + else: + x = x[:, :, 2:-2, :] + + x = x.to(cpu_device) + sliced[i] = x + del x + + hidden_states = torch.cat(sliced, dim=2) + # print("us hidden_states", 
hidden_states.shape) + del sliced + + hidden_states = hidden_states.to(org_device) + return hidden_states + + +class SlicingAutoencoderKL(ModelMixin, ConfigMixin): + r"""Variational Autoencoder (VAE) model with KL loss from the paper Auto-Encoding Variational Bayes by Diederik P. Kingma + and Max Welling. + + This model inherits from [`ModelMixin`]. Check the superclass documentation for the generic methods the library + implements for all the model (such as downloading or saving, etc.) + + Parameters: + in_channels (int, *optional*, defaults to 3): Number of channels in the input image. + out_channels (int, *optional*, defaults to 3): Number of channels in the output. + down_block_types (`Tuple[str]`, *optional*, defaults to : + obj:`("DownEncoderBlock2D",)`): Tuple of downsample block types. + up_block_types (`Tuple[str]`, *optional*, defaults to : + obj:`("UpDecoderBlock2D",)`): Tuple of upsample block types. + block_out_channels (`Tuple[int]`, *optional*, defaults to : + obj:`(64,)`): Tuple of block output channels. + act_fn (`str`, *optional*, defaults to `"silu"`): The activation function to use. + latent_channels (`int`, *optional*, defaults to `4`): Number of channels in the latent space. + sample_size (`int`, *optional*, defaults to `32`): TODO + """ + + @register_to_config + def __init__( + self, + in_channels: int = 3, + out_channels: int = 3, + down_block_types: Tuple[str] = ("DownEncoderBlock2D",), + up_block_types: Tuple[str] = ("UpDecoderBlock2D",), + block_out_channels: Tuple[int] = (64,), + layers_per_block: int = 1, + act_fn: str = "silu", + latent_channels: int = 4, + norm_num_groups: int = 32, + sample_size: int = 32, + num_slices: int = 16, + ): + super().__init__() + + # pass init params to Encoder + self.encoder = SlicingEncoder( + in_channels=in_channels, + out_channels=latent_channels, + down_block_types=down_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + act_fn=act_fn, + norm_num_groups=norm_num_groups, + double_z=True, + num_slices=num_slices, + ) + + # pass init params to Decoder + self.decoder = SlicingDecoder( + in_channels=latent_channels, + out_channels=out_channels, + up_block_types=up_block_types, + block_out_channels=block_out_channels, + layers_per_block=layers_per_block, + norm_num_groups=norm_num_groups, + act_fn=act_fn, + num_slices=num_slices, + ) + + self.quant_conv = torch.nn.Conv2d(2 * latent_channels, 2 * latent_channels, 1) + self.post_quant_conv = torch.nn.Conv2d(latent_channels, latent_channels, 1) + self.use_slicing = False + + def encode(self, x: torch.FloatTensor, return_dict: bool = True) -> AutoencoderKLOutput: + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + + if not return_dict: + return (posterior,) + + return AutoencoderKLOutput(latent_dist=posterior) + + def _decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + z = self.post_quant_conv(z) + dec = self.decoder(z) + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) + + # これはバッチ方向のスライシング 紛らわしい + def enable_slicing(self): + r""" + Enable sliced VAE decoding. + + When this option is enabled, the VAE will split the input tensor in slices to compute decoding in several + steps. This is useful to save some memory and allow larger batch sizes. + """ + self.use_slicing = True + + def disable_slicing(self): + r""" + Disable sliced VAE decoding. 
If `enable_slicing` was previously invoked, this method will go back to computing + decoding in one step. + """ + self.use_slicing = False + + def decode(self, z: torch.FloatTensor, return_dict: bool = True) -> Union[DecoderOutput, torch.FloatTensor]: + if self.use_slicing and z.shape[0] > 1: + decoded_slices = [self._decode(z_slice).sample for z_slice in z.split(1)] + decoded = torch.cat(decoded_slices) + else: + decoded = self._decode(z).sample + + if not return_dict: + return (decoded,) + + return DecoderOutput(sample=decoded) + + def forward( + self, + sample: torch.FloatTensor, + sample_posterior: bool = False, + return_dict: bool = True, + generator: Optional[torch.Generator] = None, + ) -> Union[DecoderOutput, torch.FloatTensor]: + r""" + Args: + sample (`torch.FloatTensor`): Input sample. + sample_posterior (`bool`, *optional*, defaults to `False`): + Whether to sample from the posterior. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`DecoderOutput`] instead of a plain tuple. + """ + x = sample + posterior = self.encode(x).latent_dist + if sample_posterior: + z = posterior.sample(generator=generator) + else: + z = posterior.mode() + dec = self.decode(z).sample + + if not return_dict: + return (dec,) + + return DecoderOutput(sample=dec) diff --git a/library/svd_merge_lora_gui.py b/library/svd_merge_lora_gui.py index 042be2ecc..a62bafbad 100644 --- a/library/svd_merge_lora_gui.py +++ b/library/svd_merge_lora_gui.py @@ -71,7 +71,7 @@ def svd_merge_lora( ### -def gradio_svd_merge_lora_tab(): +def gradio_svd_merge_lora_tab(headless=False): with gr.Tab('Merge LoRA (SVD)'): gr.Markdown('This utility can merge two LoRA networks together.') @@ -85,7 +85,9 @@ def gradio_svd_merge_lora_tab(): interactive=True, ) button_lora_a_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_lora_a_model_file.click( get_file_path, @@ -100,7 +102,9 @@ def gradio_svd_merge_lora_tab(): interactive=True, ) button_lora_b_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_lora_b_model_file.click( get_file_path, @@ -141,7 +145,9 @@ def gradio_svd_merge_lora_tab(): interactive=True, ) button_save_to = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_save_to.click( get_saveasfilename_path, diff --git a/library/train_util.py b/library/train_util.py index 8c6e34371..d963537db 100644 --- a/library/train_util.py +++ b/library/train_util.py @@ -19,6 +19,7 @@ Union, ) from accelerate import Accelerator +import gc import glob import math import os @@ -30,6 +31,7 @@ from tqdm import tqdm import torch +from torch.nn.parallel import DistributedDataParallel as DDP from torch.optim import Optimizer from torchvision import transforms from transformers import CLIPTokenizer @@ -346,6 +348,8 @@ def __init__( self.is_reg = is_reg self.class_tokens = class_tokens self.caption_extension = caption_extension + if self.caption_extension and not self.caption_extension.startswith("."): + self.caption_extension = "." 
+ self.caption_extension def __eq__(self, other) -> bool: if not isinstance(other, DreamBoothSubset): @@ -1079,16 +1083,37 @@ def load_dreambooth_dir(subset: DreamBoothSubset): # 画像ファイルごとにプロンプトを読み込み、もしあればそちらを使う captions = [] + missing_captions = [] for img_path in img_paths: cap_for_img = read_caption(img_path, subset.caption_extension) if cap_for_img is None and subset.class_tokens is None: - print(f"neither caption file nor class tokens are found. use empty caption for {img_path}") + print( + f"neither caption file nor class tokens are found. use empty caption for {img_path} / キャプションファイルもclass tokenも見つかりませんでした。空のキャプションを使用します: {img_path}" + ) captions.append("") + missing_captions.append(img_path) else: - captions.append(subset.class_tokens if cap_for_img is None else cap_for_img) + if cap_for_img is None: + captions.append(subset.class_tokens) + missing_captions.append(img_path) + else: + captions.append(cap_for_img) self.set_tag_frequency(os.path.basename(subset.image_dir), captions) # タグ頻度を記録 + if missing_captions: + number_of_missing_captions = len(missing_captions) + number_of_missing_captions_to_show = 5 + remaining_missing_captions = number_of_missing_captions - number_of_missing_captions_to_show + + print( + f"No caption file found for {number_of_missing_captions} images. Training will continue without captions for these images. If class token exists, it will be used. / {number_of_missing_captions}枚の画像にキャプションファイルが見つかりませんでした。これらの画像についてはキャプションなしで学習を続行します。class tokenが存在する場合はそれを使います。" + ) + for i, missing_caption in enumerate(missing_captions): + if i >= number_of_missing_captions_to_show: + print(missing_caption + f"... and {remaining_missing_captions} more") + break + print(missing_caption) return img_paths, captions print("prepare images.") @@ -1422,7 +1447,7 @@ def debug_dataset(train_dataset, show_input_ids=False): epoch = 1 while True: - print(f"epoch: {epoch}") + print(f"\nepoch: {epoch}") steps = (epoch - 1) * len(train_dataset) + 1 indices = list(range(len(train_dataset))) @@ -1763,6 +1788,7 @@ def backward(ctx, do): def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditionModel, mem_eff_attn, xformers): + # unet is not used currently, but it is here for future use if mem_eff_attn: replace_unet_cross_attn_to_memory_efficient() elif xformers: @@ -1770,7 +1796,7 @@ def replace_unet_modules(unet: diffusers.models.unet_2d_condition.UNet2DConditio def replace_unet_cross_attn_to_memory_efficient(): - print("Replace CrossAttention.forward to use FlashAttention (not xformers)") + print("CrossAttention.forward has been replaced to FlashAttention (not xformers)") flash_func = FlashAttentionFunction def forward_flash_attn(self, x, context=None, mask=None): @@ -1810,7 +1836,7 @@ def forward_flash_attn(self, x, context=None, mask=None): def replace_unet_cross_attn_to_xformers(): - print("Replace CrossAttention.forward to use xformers") + print("CrossAttention.forward has been replaced to enable xformers.") try: import xformers.ops except ImportError: @@ -1852,6 +1878,60 @@ def forward_xformers(self, x, context=None, mask=None): diffusers.models.attention.CrossAttention.forward = forward_xformers +""" +def replace_vae_modules(vae: diffusers.models.AutoencoderKL, mem_eff_attn, xformers): + # vae is not used currently, but it is here for future use + if mem_eff_attn: + replace_vae_attn_to_memory_efficient() + elif xformers: + # とりあえずDiffusersのxformersを使う。AttentionがあるのはMidBlockのみ + print("Use Diffusers xformers for VAE") + 
vae.encoder.mid_block.attentions[0].set_use_memory_efficient_attention_xformers(True) + vae.decoder.mid_block.attentions[0].set_use_memory_efficient_attention_xformers(True) + + +def replace_vae_attn_to_memory_efficient(): + print("AttentionBlock.forward has been replaced to FlashAttention (not xformers)") + flash_func = FlashAttentionFunction + + def forward_flash_attn(self, hidden_states): + print("forward_flash_attn") + q_bucket_size = 512 + k_bucket_size = 1024 + + residual = hidden_states + batch, channel, height, width = hidden_states.shape + + # norm + hidden_states = self.group_norm(hidden_states) + + hidden_states = hidden_states.view(batch, channel, height * width).transpose(1, 2) + + # proj to q, k, v + query_proj = self.query(hidden_states) + key_proj = self.key(hidden_states) + value_proj = self.value(hidden_states) + + query_proj, key_proj, value_proj = map( + lambda t: rearrange(t, "b n (h d) -> b h n d", h=self.num_heads), (query_proj, key_proj, value_proj) + ) + + out = flash_func.apply(query_proj, key_proj, value_proj, None, False, q_bucket_size, k_bucket_size) + + out = rearrange(out, "b h n d -> b n (h d)") + + # compute next hidden_states + hidden_states = self.proj_attn(hidden_states) + hidden_states = hidden_states.transpose(-1, -2).reshape(batch, channel, height, width) + + # res connect and rescale + hidden_states = (hidden_states + residual) / self.rescale_output_factor + return hidden_states + + diffusers.models.attention.AttentionBlock.forward = forward_flash_attn +""" + + # endregion @@ -1883,7 +1963,7 @@ def add_optimizer_arguments(parser: argparse.ArgumentParser): "--optimizer_type", type=str, default="", - help="Optimizer to use / オプティマイザの種類: AdamW (default), AdamW8bit, Lion, SGDNesterov, SGDNesterov8bit, DAdaptation, AdaFactor", + help="Optimizer to use / オプティマイザの種類: AdamW (default), AdamW8bit, Lion8bit, Lion, SGDNesterov, SGDNesterov8bit, DAdaptation(DAdaptAdamPreprint), DAdaptAdaGrad, DAdaptAdam, DAdaptAdan, DAdaptAdanIP, DAdaptLion, DAdaptSGD, AdaFactor", ) # backward compatibility @@ -2119,6 +2199,30 @@ def add_training_arguments(parser: argparse.ArgumentParser, support_dreambooth: default=None, help="enable noise offset with this value (if enabled, around 0.1 is recommended) / Noise offsetを有効にしてこの値を設定する(有効にする場合は0.1程度を推奨)", ) + parser.add_argument( + "--multires_noise_iterations", + type=int, + default=None, + help="enable multires noise with this number of iterations (if enabled, around 6-10 is recommended) / Multires noiseを有効にしてこのイテレーション数を設定する(有効にする場合は6-10程度を推奨)", + ) + # parser.add_argument( + # "--perlin_noise", + # type=int, + # default=None, + # help="enable perlin noise and set the octaves / perlin noiseを有効にしてoctavesをこの値に設定する", + # ) + parser.add_argument( + "--multires_noise_discount", + type=float, + default=0.3, + help="set discount value for multires noise (has no effect without --multires_noise_iterations) / Multires noiseのdiscount値を設定する(--multires_noise_iterations指定時のみ有効)", + ) + parser.add_argument( + "--adaptive_noise_scale", + type=float, + default=None, + help="add `latent mean absolute value * this value` to noise_offset (disabled if None, default) / latentの平均値の絶対値 * この値をnoise_offsetに加算する(Noneの場合は無効、デフォルト)", + ) parser.add_argument( "--lowram", action="store_true", @@ -2191,6 +2295,22 @@ def verify_training_args(args: argparse.Namespace): "cache_latents_to_disk is enabled, so cache_latents is also enabled / cache_latents_to_diskが有効なため、cache_latentsを有効にします" ) + # noise_offset, perlin_noise, multires_noise_iterations cannot be enabled at 
the same time + # Listを使って数えてもいいけど並べてしまえ + if args.noise_offset is not None and args.multires_noise_iterations is not None: + raise ValueError( + "noise_offset and multires_noise_iterations cannot be enabled at the same time / noise_offsetとmultires_noise_iterationsを同時に有効にできません" + ) + # if args.noise_offset is not None and args.perlin_noise is not None: + # raise ValueError("noise_offset and perlin_noise cannot be enabled at the same time / noise_offsetとperlin_noiseは同時に有効にできません") + # if args.perlin_noise is not None and args.multires_noise_iterations is not None: + # raise ValueError( + # "perlin_noise and multires_noise_iterations cannot be enabled at the same time / perlin_noiseとmultires_noise_iterationsを同時に有効にできません" + # ) + + if args.adaptive_noise_scale is not None and args.noise_offset is None: + raise ValueError("adaptive_noise_scale requires noise_offset / adaptive_noise_scaleを使用するにはnoise_offsetが必要です") + def add_dataset_arguments( parser: argparse.ArgumentParser, support_dreambooth: bool, support_caption: bool, support_caption_dropout: bool @@ -2448,7 +2568,7 @@ def task(): def get_optimizer(args, trainable_params): - # "Optimizer to use: AdamW, AdamW8bit, Lion, SGDNesterov, SGDNesterov8bit, DAdaptation, Adafactor" + # "Optimizer to use: AdamW, AdamW8bit, Lion, SGDNesterov, SGDNesterov8bit, Lion8bit, DAdaptation(DAdaptAdamPreprint), DAdaptAdaGrad, DAdaptAdam, DAdaptAdan, DAdaptAdanIP, DAdaptLion, DAdaptSGD, Adafactor" optimizer_type = args.optimizer_type if args.use_8bit_adam: @@ -2526,6 +2646,22 @@ def get_optimizer(args, trainable_params): optimizer_class = lion_pytorch.Lion optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) + elif optimizer_type == "Lion8bit".lower(): + try: + import bitsandbytes as bnb + except ImportError: + raise ImportError("No bitsandbytes / bitsandbytesがインストールされていないようです") + + print(f"use 8-bit Lion optimizer | {optimizer_kwargs}") + try: + optimizer_class = bnb.optim.Lion8bit + except AttributeError: + raise AttributeError( + "No Lion8bit. The version of bitsandbytes installed seems to be old. Please install 0.38.0 or later. / Lion8bitが定義されていません。インストールされているbitsandbytesのバージョンが古いようです。0.38.0以上をインストールしてください" + ) + + optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) + elif optimizer_type == "SGDNesterov".lower(): print(f"use SGD with Nesterov optimizer | {optimizer_kwargs}") if "momentum" not in optimizer_kwargs: @@ -2535,13 +2671,16 @@ def get_optimizer(args, trainable_params): optimizer_class = torch.optim.SGD optimizer = optimizer_class(trainable_params, lr=lr, nesterov=True, **optimizer_kwargs) - elif optimizer_type == "DAdaptation".lower(): + elif optimizer_type.startswith("DAdapt".lower()): + # DAdaptation family + # check dadaptation is installed try: import dadaptation + import dadaptation.experimental as experimental except ImportError: raise ImportError("No dadaptation / dadaptation がインストールされていないようです") - print(f"use D-Adaptation Adam optimizer | {optimizer_kwargs}") + # check lr and lr_count, and print warning actual_lr = lr lr_count = 1 if type(trainable_params) == list and type(trainable_params[0]) == dict: @@ -2561,7 +2700,31 @@ def get_optimizer(args, trainable_params): f"when multiple learning rates are specified with dadaptation (e.g. 
for Text Encoder and U-Net), only the first one will take effect / D-Adaptationで複数の学習率を指定した場合(Text EncoderとU-Netなど)、最初の学習率のみが有効になります: lr={actual_lr}" ) - optimizer_class = dadaptation.DAdaptAdam + # set optimizer + if optimizer_type == "DAdaptation".lower() or optimizer_type == "DAdaptAdamPreprint".lower(): + optimizer_class = experimental.DAdaptAdamPreprint + print(f"use D-Adaptation AdamPreprint optimizer | {optimizer_kwargs}") + elif optimizer_type == "DAdaptAdaGrad".lower(): + optimizer_class = dadaptation.DAdaptAdaGrad + print(f"use D-Adaptation AdaGrad optimizer | {optimizer_kwargs}") + elif optimizer_type == "DAdaptAdam".lower(): + optimizer_class = dadaptation.DAdaptAdam + print(f"use D-Adaptation Adam optimizer | {optimizer_kwargs}") + elif optimizer_type == "DAdaptAdan".lower(): + optimizer_class = dadaptation.DAdaptAdan + print(f"use D-Adaptation Adan optimizer | {optimizer_kwargs}") + elif optimizer_type == "DAdaptAdanIP".lower(): + optimizer_class = experimental.DAdaptAdanIP + print(f"use D-Adaptation AdanIP optimizer | {optimizer_kwargs}") + elif optimizer_type == "DAdaptLion".lower(): + optimizer_class = dadaptation.DAdaptLion + print(f"use D-Adaptation Lion optimizer | {optimizer_kwargs}") + elif optimizer_type == "DAdaptSGD".lower(): + optimizer_class = dadaptation.DAdaptSGD + print(f"use D-Adaptation SGD optimizer | {optimizer_kwargs}") + else: + raise ValueError(f"Unknown optimizer type: {optimizer_type}") + optimizer = optimizer_class(trainable_params, lr=lr, **optimizer_kwargs) elif optimizer_type == "Adafactor".lower(): @@ -2850,16 +3013,16 @@ def prepare_dtype(args: argparse.Namespace): return weight_dtype, save_dtype -def load_target_model(args: argparse.Namespace, weight_dtype, device="cpu"): +def _load_target_model(args: argparse.Namespace, weight_dtype, device="cpu"): name_or_path = args.pretrained_model_name_or_path name_or_path = os.readlink(name_or_path) if os.path.islink(name_or_path) else name_or_path load_stable_diffusion_format = os.path.isfile(name_or_path) # determine SD or Diffusers if load_stable_diffusion_format: - print("load StableDiffusion checkpoint") + print(f"load StableDiffusion checkpoint: {name_or_path}") text_encoder, vae, unet = model_util.load_models_from_stable_diffusion_checkpoint(args.v2, name_or_path, device) else: # Diffusers model is loaded to CPU - print("load Diffusers pretrained models") + print(f"load Diffusers pretrained models: {name_or_path}") try: pipe = StableDiffusionPipeline.from_pretrained(name_or_path, tokenizer=None, safety_checker=None) except EnvironmentError as ex: @@ -2879,6 +3042,36 @@ def load_target_model(args: argparse.Namespace, weight_dtype, device="cpu"): return text_encoder, vae, unet, load_stable_diffusion_format +def transform_if_model_is_DDP(text_encoder, unet, network=None): + # Transform text_encoder, unet and network from DistributedDataParallel + return (model.module if type(model) == DDP else model for model in [text_encoder, unet, network] if model is not None) + + +def load_target_model(args, weight_dtype, accelerator): + # load models for each process + for pi in range(accelerator.state.num_processes): + if pi == accelerator.state.local_process_index: + print(f"loading model for process {accelerator.state.local_process_index}/{accelerator.state.num_processes}") + + text_encoder, vae, unet, load_stable_diffusion_format = _load_target_model( + args, weight_dtype, accelerator.device if args.lowram else "cpu" + ) + + # work on low-ram device + if args.lowram: + text_encoder.to(accelerator.device) + 
unet.to(accelerator.device) + vae.to(accelerator.device) + + gc.collect() + torch.cuda.empty_cache() + accelerator.wait_for_everyone() + + text_encoder, unet = transform_if_model_is_DDP(text_encoder, unet) + + return text_encoder, vae, unet, load_stable_diffusion_format + + def patch_accelerator_for_fp16_training(accelerator): org_unscale_grads = accelerator.scaler._unscale_grads_ @@ -3018,7 +3211,7 @@ def save_sd_model_on_epoch_end_or_stepwise( ckpt_name = get_step_ckpt_name(args, ext, global_step) ckpt_file = os.path.join(args.output_dir, ckpt_name) - print(f"saving checkpoint: {ckpt_file}") + print(f"\nsaving checkpoint: {ckpt_file}") model_util.save_stable_diffusion_checkpoint( args.v2, ckpt_file, text_encoder, unet, src_path, epoch_no, global_step, save_dtype, vae ) @@ -3044,7 +3237,7 @@ def save_sd_model_on_epoch_end_or_stepwise( else: out_dir = os.path.join(args.output_dir, STEP_DIFFUSERS_DIR_NAME.format(model_name, global_step)) - print(f"saving model: {out_dir}") + print(f"\nsaving model: {out_dir}") model_util.save_diffusers_checkpoint( args.v2, out_dir, text_encoder, unet, src_path, vae=vae, use_safetensors=use_safetensors ) @@ -3062,16 +3255,17 @@ def save_sd_model_on_epoch_end_or_stepwise( print(f"removing old model: {remove_out_dir}") shutil.rmtree(remove_out_dir) - if on_epoch_end: - save_and_remove_state_on_epoch_end(args, accelerator, epoch_no) - else: - save_and_remove_state_stepwise(args, accelerator, global_step) + if args.save_state: + if on_epoch_end: + save_and_remove_state_on_epoch_end(args, accelerator, epoch_no) + else: + save_and_remove_state_stepwise(args, accelerator, global_step) def save_and_remove_state_on_epoch_end(args: argparse.Namespace, accelerator, epoch_no): model_name = default_if_none(args.output_name, DEFAULT_EPOCH_NAME) - print(f"saving state at epoch {epoch_no}") + print(f"\nsaving state at epoch {epoch_no}") os.makedirs(args.output_dir, exist_ok=True) state_dir = os.path.join(args.output_dir, EPOCH_STATE_NAME.format(model_name, epoch_no)) @@ -3092,7 +3286,7 @@ def save_and_remove_state_on_epoch_end(args: argparse.Namespace, accelerator, ep def save_and_remove_state_stepwise(args: argparse.Namespace, accelerator, step_no): model_name = default_if_none(args.output_name, DEFAULT_STEP_NAME) - print(f"saving state at step {step_no}") + print(f"\nsaving state at step {step_no}") os.makedirs(args.output_dir, exist_ok=True) state_dir = os.path.join(args.output_dir, STEP_STATE_NAME.format(model_name, step_no)) @@ -3117,7 +3311,7 @@ def save_and_remove_state_stepwise(args: argparse.Namespace, accelerator, step_n def save_state_on_train_end(args: argparse.Namespace, accelerator): model_name = default_if_none(args.output_name, DEFAULT_LAST_OUTPUT_NAME) - print("saving last state.") + print("\nsaving last state.") os.makedirs(args.output_dir, exist_ok=True) state_dir = os.path.join(args.output_dir, LAST_STATE_NAME.format(model_name)) @@ -3189,7 +3383,7 @@ def sample_images( if steps % args.sample_every_n_steps != 0 or epoch is not None: # steps is not divisible or end of epoch return - print(f"generating sample images at step / サンプル画像生成 ステップ: {steps}") + print(f"\ngenerating sample images at step / サンプル画像生成 ステップ: {steps}") if not os.path.isfile(args.sample_prompts): print(f"No prompt file / プロンプトファイルがありません: {args.sample_prompts}") return @@ -3198,8 +3392,21 @@ def sample_images( vae.to(device) # read prompts - with open(args.sample_prompts, "rt", encoding="utf-8") as f: - prompts = f.readlines() + + # with open(args.sample_prompts, "rt", encoding="utf-8") as 
f: + # prompts = f.readlines() + + if args.sample_prompts.endswith(".txt"): + with open(args.sample_prompts, "r", encoding="utf-8") as f: + lines = f.readlines() + prompts = [line.strip() for line in lines if len(line.strip()) > 0 and line[0] != "#"] + elif args.sample_prompts.endswith(".toml"): + with open(args.sample_prompts, "r", encoding="utf-8") as f: + data = toml.load(f) + prompts = [dict(**data["prompt"], **subset) for subset in data["prompt"]["subset"]] + elif args.sample_prompts.endswith(".json"): + with open(args.sample_prompts, "r", encoding="utf-8") as f: + prompts = json.load(f) # schedulerを用意する sched_init_args = {} @@ -3262,60 +3469,70 @@ def sample_images( os.makedirs(save_dir, exist_ok=True) rng_state = torch.get_rng_state() - cuda_rng_state = torch.cuda.get_rng_state() + cuda_rng_state = torch.cuda.get_rng_state() if torch.cuda.is_available() else None with torch.no_grad(): with accelerator.autocast(): for i, prompt in enumerate(prompts): if not accelerator.is_main_process: continue - prompt = prompt.strip() - if len(prompt) == 0 or prompt[0] == "#": - continue - # subset of gen_img_diffusers - prompt_args = prompt.split(" --") - prompt = prompt_args[0] - negative_prompt = None - sample_steps = 30 - width = height = 512 - scale = 7.5 - seed = None - for parg in prompt_args: - try: - m = re.match(r"w (\d+)", parg, re.IGNORECASE) - if m: - width = int(m.group(1)) - continue - - m = re.match(r"h (\d+)", parg, re.IGNORECASE) - if m: - height = int(m.group(1)) - continue - - m = re.match(r"d (\d+)", parg, re.IGNORECASE) - if m: - seed = int(m.group(1)) - continue - - m = re.match(r"s (\d+)", parg, re.IGNORECASE) - if m: # steps - sample_steps = max(1, min(1000, int(m.group(1)))) - continue - - m = re.match(r"l ([\d\.]+)", parg, re.IGNORECASE) - if m: # scale - scale = float(m.group(1)) - continue - - m = re.match(r"n (.+)", parg, re.IGNORECASE) - if m: # negative prompt - negative_prompt = m.group(1) - continue - - except ValueError as ex: - print(f"Exception in parsing / 解析エラー: {parg}") - print(ex) + if isinstance(prompt, dict): + negative_prompt = prompt.get("negative_prompt") + sample_steps = prompt.get("sample_steps", 30) + width = prompt.get("width", 512) + height = prompt.get("height", 512) + scale = prompt.get("scale", 7.5) + seed = prompt.get("seed") + prompt = prompt.get("prompt") + else: + # prompt = prompt.strip() + # if len(prompt) == 0 or prompt[0] == "#": + # continue + + # subset of gen_img_diffusers + prompt_args = prompt.split(" --") + prompt = prompt_args[0] + negative_prompt = None + sample_steps = 30 + width = height = 512 + scale = 7.5 + seed = None + for parg in prompt_args: + try: + m = re.match(r"w (\d+)", parg, re.IGNORECASE) + if m: + width = int(m.group(1)) + continue + + m = re.match(r"h (\d+)", parg, re.IGNORECASE) + if m: + height = int(m.group(1)) + continue + + m = re.match(r"d (\d+)", parg, re.IGNORECASE) + if m: + seed = int(m.group(1)) + continue + + m = re.match(r"s (\d+)", parg, re.IGNORECASE) + if m: # steps + sample_steps = max(1, min(1000, int(m.group(1)))) + continue + + m = re.match(r"l ([\d\.]+)", parg, re.IGNORECASE) + if m: # scale + scale = float(m.group(1)) + continue + + m = re.match(r"n (.+)", parg, re.IGNORECASE) + if m: # negative prompt + negative_prompt = m.group(1) + continue + + except ValueError as ex: + print(f"Exception in parsing / 解析エラー: {parg}") + print(ex) if seed is not None: torch.manual_seed(seed) @@ -3369,7 +3586,8 @@ def sample_images( torch.cuda.empty_cache() torch.set_rng_state(rng_state) - 
torch.cuda.set_rng_state(cuda_rng_state) + if cuda_rng_state is not None: + torch.cuda.set_rng_state(cuda_rng_state) vae.to(org_vae_device) diff --git a/library/utilities.py b/library/utilities.py index 8c45bff6d..611bd06ab 100644 --- a/library/utilities.py +++ b/library/utilities.py @@ -11,6 +11,7 @@ from library.blip_caption_gui import gradio_blip_caption_gui_tab from library.git_caption_gui import gradio_git_caption_gui_tab from library.wd14_caption_gui import gradio_wd14_caption_gui_tab +from library.group_images_gui import gradio_group_images_gui_tab def utilities_tab( @@ -20,13 +21,15 @@ def utilities_tab( logging_dir_input=gr.Textbox(), enable_copy_info_button=bool(False), enable_dreambooth_tab=True, + headless=False ): with gr.Tab('Captioning'): - gradio_basic_caption_gui_tab() - gradio_blip_caption_gui_tab() - gradio_git_caption_gui_tab() - gradio_wd14_caption_gui_tab() - gradio_convert_model_tab() + gradio_basic_caption_gui_tab(headless=headless) + gradio_blip_caption_gui_tab(headless=headless) + gradio_git_caption_gui_tab(headless=headless) + gradio_wd14_caption_gui_tab(headless=headless) + gradio_convert_model_tab(headless=headless) + gradio_group_images_gui_tab(headless=headless) return ( train_data_dir_input, diff --git a/library/verify_lora_gui.py b/library/verify_lora_gui.py index a7a0bf9ef..bc3db5be3 100644 --- a/library/verify_lora_gui.py +++ b/library/verify_lora_gui.py @@ -50,7 +50,7 @@ def verify_lora( ### -def gradio_verify_lora_tab(): +def gradio_verify_lora_tab(headless=False): with gr.Tab('Verify LoRA'): gr.Markdown( 'This utility can verify a LoRA network to make sure it is properly trained.' @@ -66,7 +66,9 @@ def gradio_verify_lora_tab(): interactive=True, ) button_lora_model_file = gr.Button( - folder_symbol, elem_id='open_folder_small' + folder_symbol, + elem_id='open_folder_small', + visible=(not headless), ) button_lora_model_file.click( get_file_path, diff --git a/library/wd14_caption_gui.py b/library/wd14_caption_gui.py index 1970849bd..74a58aa3e 100644 --- a/library/wd14_caption_gui.py +++ b/library/wd14_caption_gui.py @@ -5,24 +5,20 @@ import os -def replace_underscore_with_space(folder_path, file_extension): - for file_name in os.listdir(folder_path): - if file_name.endswith(file_extension): - file_path = os.path.join(folder_path, file_name) - with open(file_path, 'r') as file: - file_content = file.read() - new_file_content = file_content.replace('_', ' ') - with open(file_path, 'w') as file: - file.write(new_file_content) - def caption_images( - train_data_dir, caption_extension, batch_size, thresh, replace_underscores + train_data_dir, + caption_extension, + batch_size, + general_threshold, + character_threshold, + replace_underscores, + model, + recursive, + max_data_loader_n_workers, + debug, + undesired_tags, + frequency_tags, ): - # Check for caption_text_input - # if caption_text_input == "": - # msgbox("Caption text is missing...") - # return - # Check for images_dir_input if train_data_dir == '': msgbox('Image folder is missing...') @@ -34,9 +30,26 @@ def caption_images( print(f'Captioning files in {train_data_dir}...') run_cmd = f'accelerate launch "./finetune/tag_images_by_wd14_tagger.py"' - run_cmd += f' --batch_size="{int(batch_size)}"' - run_cmd += f' --thresh="{thresh}"' + run_cmd += f' --batch_size={int(batch_size)}' + run_cmd += f' --general_threshold={general_threshold}' + run_cmd += f' --character_threshold={character_threshold}' run_cmd += f' --caption_extension="{caption_extension}"' + run_cmd += f' --model="{model}"' + run_cmd += ( 
+ f' --max_data_loader_n_workers="{int(max_data_loader_n_workers)}"' + ) + + if recursive: + run_cmd += f' --recursive' + if debug: + run_cmd += f' --debug' + if replace_underscores: + run_cmd += f' --remove_underscore' + if frequency_tags: + run_cmd += f' --frequency_tags' + + if not undesired_tags == '': + run_cmd += f' --undesired_tags="{undesired_tags}"' run_cmd += f' "{train_data_dir}"' print(run_cmd) @@ -46,9 +59,6 @@ def caption_images( os.system(run_cmd) else: subprocess.run(run_cmd) - - if replace_underscores: - replace_underscore_with_space(train_data_dir, caption_extension) print('...captioning done') @@ -58,11 +68,14 @@ def caption_images( ### -def gradio_wd14_caption_gui_tab(): +def gradio_wd14_caption_gui_tab(headless=False): with gr.Tab('WD14 Captioning'): gr.Markdown( 'This utility will use WD14 to caption files for each images in a folder.' ) + + # Input Settings + # with gr.Section('Input Settings'): with gr.Row(): train_data_dir = gr.Textbox( label='Image folder to caption', @@ -70,7 +83,7 @@ def gradio_wd14_caption_gui_tab(): interactive=True, ) button_train_data_dir_input = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) button_train_data_dir_input.click( get_folder_path, @@ -84,17 +97,75 @@ def gradio_wd14_caption_gui_tab(): value='.txt', interactive=True, ) - thresh = gr.Number(value=0.35, label='Threshold') - batch_size = gr.Number( - value=1, label='Batch size', interactive=True - ) + undesired_tags = gr.Textbox( + label='Undesired tags', + placeholder='(Optional) Separate `undesired_tags` with comma `(,)` if you want to remove multiple tags, e.g. `1girl,solo,smile`.', + interactive=True, + ) + with gr.Row(): replace_underscores = gr.Checkbox( label='Replace underscores in filenames with spaces', - value=False, + value=True, interactive=True, ) + recursive = gr.Checkbox( + label='Recursive', + value=False, + info='Tag subfolders images as well', + ) + + debug = gr.Checkbox( + label='Verbose logging', + value=True, + info='Debug while tagging, it will print your image file with general tags and character tags.', + ) + frequency_tags = gr.Checkbox( + label='Show tags frequency', + value=True, + info='Show frequency of tags for images.', + ) + + # Model Settings + with gr.Row(): + model = gr.Dropdown( + label='Model', + choices=[ + 'SmilingWolf/wd-v1-4-convnext-tagger-v2', + 'SmilingWolf/wd-v1-4-convnextv2-tagger-v2', + 'SmilingWolf/wd-v1-4-vit-tagger-v2', + 'SmilingWolf/wd-v1-4-swinv2-tagger-v2', + ], + value='SmilingWolf/wd-v1-4-convnextv2-tagger-v2', + ) + + general_threshold = gr.Slider( + value=0.35, + label='General threshold', + info='Adjust `general_threshold` for pruning tags (less tags, less flexible)', + minimum=0, + maximum=1, + step=0.05, + ) + character_threshold = gr.Slider( + value=0.35, + label='Character threshold', + info='useful if you want to train with characte', + minimum=0, + maximum=1, + step=0.05, + ) + + # Advanced Settings + with gr.Row(): + batch_size = gr.Number( + value=8, label='Batch size', interactive=True + ) + + max_data_loader_n_workers = gr.Number( + value=2, label='Max dataloader workers', interactive=True + ) caption_button = gr.Button('Caption images') @@ -104,8 +175,15 @@ def gradio_wd14_caption_gui_tab(): train_data_dir, caption_extension, batch_size, - thresh, + general_threshold, + character_threshold, replace_underscores, + model, + recursive, + max_data_loader_n_workers, + debug, + undesired_tags, + frequency_tags, ], show_progress=False, ) diff --git 
a/lora_gui.py b/lora_gui.py index 256129ca4..958688733 100644 --- a/lora_gui.py +++ b/lora_gui.py @@ -6,6 +6,8 @@ import gradio as gr import logging import time + +# import easygui import json import math import os @@ -29,6 +31,7 @@ # set_legacy_8bitadam, update_my_data, check_if_model_exist, + output_message, ) from library.dreambooth_folder_creation_gui import ( gradio_dreambooth_folder_creation_tab, @@ -45,7 +48,8 @@ from library.verify_lora_gui import gradio_verify_lora_tab from library.resize_lora_gui import gradio_resize_lora_tab from library.sampler_gui import sample_gradio_config, run_cmd_sample -from easygui import msgbox + +# from easygui import msgbox folder_symbol = '\U0001f4c2' # 📂 refresh_symbol = '\U0001f504' # 🔄 @@ -132,6 +136,7 @@ def save_configuration( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -148,7 +153,7 @@ def save_configuration( text_encoder_lr, unet_lr, network_dim, - lora_network_weights, + lora_network_weights,dim_from_weights, color_aug, flip_aug, clip_skip, @@ -172,7 +177,9 @@ def save_configuration( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, LoRA_type, conv_dim, conv_alpha, @@ -196,6 +203,8 @@ def save_configuration( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -264,6 +273,7 @@ def open_configuration( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -280,7 +290,7 @@ def open_configuration( text_encoder_lr, unet_lr, network_dim, - lora_network_weights, + lora_network_weights,dim_from_weights, color_aug, flip_aug, clip_skip, @@ -304,7 +314,9 @@ def open_configuration( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, LoRA_type, conv_dim, conv_alpha, @@ -328,6 +340,8 @@ def open_configuration( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -367,6 +381,7 @@ def open_configuration( def train_model( + headless, print_only, pretrained_model_name_or_path, v2, @@ -387,6 +402,7 @@ def train_model( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -403,7 +419,7 @@ def train_model( text_encoder_lr, unet_lr, network_dim, - lora_network_weights, + lora_network_weights,dim_from_weights, color_aug, flip_aug, clip_skip, @@ -427,7 +443,9 @@ def train_model( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, LoRA_type, conv_dim, conv_alpha, @@ -451,51 +469,92 @@ def train_model( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): print_only_bool = True if print_only.get('label') == 'True' else False log.info(f'Start training LoRA {LoRA_type} ...') + headless_bool = True if headless.get('label') == 'True' else False if pretrained_model_name_or_path == '': - msgbox('Source model information is missing') + output_message( + 
msg='Source model information is missing', headless=headless_bool + ) return if train_data_dir == '': - msgbox('Image folder path is missing') + output_message( + msg='Image folder path is missing', headless=headless_bool + ) return if not os.path.exists(train_data_dir): - msgbox('Image folder does not exist') + output_message( + msg='Image folder does not exist', headless=headless_bool + ) return if reg_data_dir != '': if not os.path.exists(reg_data_dir): - msgbox('Regularisation folder does not exist') + output_message( + msg='Regularisation folder does not exist', + headless=headless_bool, + ) return if output_dir == '': - msgbox('Output folder path is missing') + output_message( + msg='Output folder path is missing', headless=headless_bool + ) return if int(bucket_reso_steps) < 1: - msgbox('Bucket resolution steps need to be greater than 0') + output_message( + msg='Bucket resolution steps need to be greater than 0', + headless=headless_bool, + ) + return + + if noise_offset == '': + noise_offset = 0 + + if float(noise_offset) > 1 or float(noise_offset) < 0: + output_message( + msg='Noise offset need to be a value between 0 and 1', + headless=headless_bool, + ) return + # if float(noise_offset) > 0 and ( + # multires_noise_iterations > 0 or multires_noise_discount > 0 + # ): + # output_message( + # msg="noise offset and multires_noise can't be set at the same time. Only use one or the other.", + # title='Error', + # headless=headless_bool, + # ) + # return + if not os.path.exists(output_dir): os.makedirs(output_dir) if stop_text_encoder_training_pct > 0: - msgbox( - 'Output "stop text encoder training" is not yet supported. Ignoring' + output_message( + msg='Output "stop text encoder training" is not yet supported. Ignoring', + headless=headless_bool, ) stop_text_encoder_training_pct = 0 - if check_if_model_exist(output_name, output_dir, save_model_as): + if check_if_model_exist( + output_name, output_dir, save_model_as, headless=headless_bool + ): return if optimizer == 'Adafactor' and lr_warmup != '0': - msgbox( - "Warning: lr_scheduler is set to 'Adafactor', so 'LR warmup (% of steps)' will be considered 0.", + output_message( + msg="Warning: lr_scheduler is set to 'Adafactor', so 'LR warmup (% of steps)' will be considered 0.", title='Warning', + headless=headless_bool, ) lr_warmup = '0' @@ -505,12 +564,6 @@ def train_model( if unet_lr == '': unet_lr = 0 - # if (float(text_encoder_lr) == 0) and (float(unet_lr) == 0): - # msgbox( - # 'At least one Learning Rate value for "Text encoder" or "Unet" need to be provided' - # ) - # return - # Get a list of all subfolders in train_data_dir subfolders = [ f @@ -556,16 +609,31 @@ def train_model( f"Error: '{folder}' does not contain an underscore, skipping..." ) + if reg_data_dir == '': + reg_factor = 1 + else: + print( + '\033[94mRegularisation images are used... 
Will double the number of steps required...\033[0m' + ) + reg_factor = 2 + + print(f'Total steps: {total_steps}') + print(f'Train batch size: {train_batch_size}') + print(f'Gradient accumulation steps: {gradient_accumulation_steps}') + print(f'Epoch: {epoch}') + print(f'Regulatization factor: {reg_factor}') + # calculate max_train_steps max_train_steps = int( math.ceil( float(total_steps) / int(train_batch_size) + / int(gradient_accumulation_steps) * int(epoch) - # * int(reg_factor) + * int(reg_factor) ) ) - log.info(f'max_train_steps = {max_train_steps}') + log.info(f'max_train_steps ({total_steps} / {train_batch_size} / {gradient_accumulation_steps} * {epoch} * {reg_factor}) = {max_train_steps}') # calculate stop encoder training if stop_text_encoder_training_pct == None: @@ -599,7 +667,8 @@ def train_model( run_cmd += f' --reg_data_dir="{reg_data_dir}"' run_cmd += f' --resolution={max_resolution}' run_cmd += f' --output_dir="{output_dir}"' - run_cmd += f' --logging_dir="{logging_dir}"' + if not logging_dir == '': + run_cmd += f' --logging_dir="{logging_dir}"' run_cmd += f' --network_alpha="{network_alpha}"' if not training_comment == '': run_cmd += f' --training_comment="{training_comment}"' @@ -704,14 +773,21 @@ def train_model( run_cmd += f' --unet_lr={unet_lr}' run_cmd += f' --network_train_unet_only' else: - if float(text_encoder_lr) == 0: - msgbox('Please input learning rate values.') + if float(learning_rate) == 0: + output_message( + msg='Please input learning rate values.', + headless=headless_bool, + ) return run_cmd += f' --network_dim={network_dim}' - if not lora_network_weights == '': - run_cmd += f' --network_weights="{lora_network_weights}"' + if LoRA_type not in ['LyCORIS/LoCon', 'LyCORIS/LoHa']: + if not lora_network_weights == '': + run_cmd += f' --network_weights="{lora_network_weights}"' + if dim_from_weights: + run_cmd += f' --dim_from_weights' + if int(gradient_accumulation_steps) > 1: run_cmd += f' --gradient_accumulation_steps={int(gradient_accumulation_steps)}' if not output_name == '': @@ -735,6 +811,7 @@ def train_model( seed=seed, caption_extension=caption_extension, cache_latents=cache_latents, + cache_latents_to_disk=cache_latents_to_disk, optimizer=optimizer, optimizer_args=optimizer_args, ) @@ -761,13 +838,19 @@ def train_model( bucket_reso_steps=bucket_reso_steps, caption_dropout_every_n_epochs=caption_dropout_every_n_epochs, caption_dropout_rate=caption_dropout_rate, + noise_offset_type=noise_offset_type, noise_offset=noise_offset, + adaptive_noise_scale=adaptive_noise_scale, + multires_noise_iterations=multires_noise_iterations, + multires_noise_discount=multires_noise_discount, additional_parameters=additional_parameters, vae_batch_size=vae_batch_size, min_snr_gamma=min_snr_gamma, save_every_n_steps=save_every_n_steps, save_last_n_steps=save_last_n_steps, save_last_n_steps_state=save_last_n_steps_state, + use_wandb=use_wandb, + wandb_api_key=wandb_api_key, ) run_cmd += run_cmd_sample( @@ -823,9 +906,12 @@ def lora_tab( reg_data_dir_input=gr.Textbox(), output_dir_input=gr.Textbox(), logging_dir_input=gr.Textbox(), + headless=False, ): dummy_db_true = gr.Label(value=True, visible=False) dummy_db_false = gr.Label(value=False, visible=False) + dummy_headless = gr.Label(value=headless, visible=False) + gr.Markdown( 'Train a custom model using kohya train network LoRA python code...' 
) @@ -835,7 +921,7 @@ def lora_tab( button_save_as_config, config_file_name, button_load_config, - ) = gradio_config() + ) = gradio_config(headless=headless) ( pretrained_model_name_or_path, @@ -847,7 +933,8 @@ def lora_tab( save_model_as_choices=[ 'ckpt', 'safetensors', - ] + ], + headless=headless, ) with gr.Tab('Folders'): @@ -856,7 +943,9 @@ def lora_tab( label='Image folder', placeholder='Folder where the training folders containing the images are located', ) - train_data_dir_folder = gr.Button('📂', elem_id='open_folder_small') + train_data_dir_folder = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) train_data_dir_folder.click( get_folder_path, outputs=train_data_dir, @@ -866,7 +955,9 @@ def lora_tab( label='Regularisation folder', placeholder='(Optional) Folder where where the regularization folders containing the images are located', ) - reg_data_dir_folder = gr.Button('📂', elem_id='open_folder_small') + reg_data_dir_folder = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) reg_data_dir_folder.click( get_folder_path, outputs=reg_data_dir, @@ -877,7 +968,9 @@ def lora_tab( label='Output folder', placeholder='Folder to output trained model', ) - output_dir_folder = gr.Button('📂', elem_id='open_folder_small') + output_dir_folder = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) output_dir_folder.click( get_folder_path, outputs=output_dir, @@ -887,7 +980,9 @@ def lora_tab( label='Logging folder', placeholder='Optional: enable logging and output TensorBoard log to this folder', ) - logging_dir_folder = gr.Button('📂', elem_id='open_folder_small') + logging_dir_folder = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) logging_dir_folder.click( get_folder_path, outputs=logging_dir, @@ -939,19 +1034,29 @@ def lora_tab( ], value='Standard', ) - lora_network_weights = gr.Textbox( - label='LoRA network weights', - placeholder='{Optional) Path to existing LoRA network weights to resume training', - ) - lora_network_weights_file = gr.Button( - document_symbol, elem_id='open_folder_small' - ) - lora_network_weights_file.click( - get_any_file_path, - inputs=[lora_network_weights], - outputs=lora_network_weights, - show_progress=False, - ) + with gr.Box(): + with gr.Row(): + lora_network_weights = gr.Textbox( + label='LoRA network weights', + placeholder='(Optional)', + info='Path to an existing LoRA network weights to resume training from' + ) + lora_network_weights_file = gr.Button( + document_symbol, + elem_id='open_folder_small', + visible=(not headless), + ) + lora_network_weights_file.click( + get_any_file_path, + inputs=[lora_network_weights], + outputs=lora_network_weights, + show_progress=False, + ) + dim_from_weights = gr.Checkbox( + label='DIM from weights', + value=False, + info='Automatically determine the dim(rank) from the weight file.', + ) ( learning_rate, lr_scheduler, @@ -965,6 +1070,7 @@ def lora_tab( seed, caption_extension, cache_latents, + cache_latents_to_disk, optimizer, optimizer_args, ) = gradio_training( @@ -974,15 +1080,15 @@ def lora_tab( ) with gr.Row(): - text_encoder_lr = gr.Textbox( + text_encoder_lr = gr.Number( label='Text Encoder learning rate', value='5e-5', - placeholder='Optional', + info='Optional', ) - unet_lr = gr.Textbox( + unet_lr = gr.Number( label='Unet learning rate', value='0.0001', - placeholder='Optional', + info='Optional', ) network_dim = gr.Slider( minimum=1, @@ -1048,6 +1154,14 @@ def update_LoRA_settings(LoRA_type): 'Kohya DyLoRA', 'Kohya 
LoCon', } + + # Determine if LoRA network weights should be visible based on LoRA_type + LoRA_network_weights_visible = LoRA_type in { + 'Standard', + 'LoCon', + 'Kohya DyLoRA', + 'Kohya LoCon', + } # Determine if kohya_dylora_visible should be visible based on LoRA_type kohya_dylora_visible = LoRA_type == 'Kohya DyLoRA' @@ -1057,6 +1171,9 @@ def update_LoRA_settings(LoRA_type): gr.Group.update(visible=LoCon_row), gr.Group.update(visible=LoRA_type_change), gr.Group.update(visible=kohya_dylora_visible), + gr.Textbox.update(visible=LoRA_network_weights_visible), + gr.Button.update(visible=LoRA_network_weights_visible), + gr.Checkbox.update(visible=LoRA_network_weights_visible), ) with gr.Row(): @@ -1175,14 +1292,18 @@ def update_LoRA_settings(LoRA_type): bucket_reso_steps, caption_dropout_every_n_epochs, caption_dropout_rate, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, additional_parameters, vae_batch_size, min_snr_gamma, save_every_n_steps, save_last_n_steps, save_last_n_steps_state, - ) = gradio_advanced_training() + use_wandb, + wandb_api_key, + ) = gradio_advanced_training(headless=headless) color_aug.change( color_aug_changed, inputs=[color_aug], @@ -1199,7 +1320,7 @@ def update_LoRA_settings(LoRA_type): LoRA_type.change( update_LoRA_settings, inputs=[LoRA_type], - outputs=[LoCon_row, kohya_advanced_lora, kohya_dylora], + outputs=[LoCon_row, kohya_advanced_lora, kohya_dylora, lora_network_weights, lora_network_weights_file, dim_from_weights], ) with gr.Tab('Tools'): @@ -1211,12 +1332,13 @@ def update_LoRA_settings(LoRA_type): reg_data_dir_input=reg_data_dir, output_dir_input=output_dir, logging_dir_input=logging_dir, + headless=headless, ) - gradio_dataset_balancing_tab() - gradio_merge_lora_tab() - gradio_svd_merge_lora_tab() - gradio_resize_lora_tab() - gradio_verify_lora_tab() + gradio_dataset_balancing_tab(headless=headless) + gradio_merge_lora_tab(headless=headless) + gradio_svd_merge_lora_tab(headless=headless) + gradio_resize_lora_tab(headless=headless) + gradio_verify_lora_tab(headless=headless) button_run = gr.Button('Train model', variant='primary') @@ -1256,6 +1378,7 @@ def update_LoRA_settings(LoRA_type): seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -1272,7 +1395,7 @@ def update_LoRA_settings(LoRA_type): text_encoder_lr, unet_lr, network_dim, - lora_network_weights, + lora_network_weights,dim_from_weights, color_aug, flip_aug, clip_skip, @@ -1296,7 +1419,9 @@ def update_LoRA_settings(LoRA_type): caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, LoRA_type, conv_dim, conv_alpha, @@ -1320,6 +1445,8 @@ def update_LoRA_settings(LoRA_type): save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ] button_open_config.click( @@ -1352,13 +1479,13 @@ def update_LoRA_settings(LoRA_type): button_run.click( train_model, - inputs=[dummy_db_false] + settings_list, + inputs=[dummy_headless] + [dummy_db_false] + settings_list, show_progress=False, ) button_print.click( train_model, - inputs=[dummy_db_true] + settings_list, + inputs=[dummy_headless] + [dummy_db_true] + settings_list, show_progress=False, ) @@ -1373,12 +1500,17 @@ def update_LoRA_settings(LoRA_type): def UI(**kwargs): css = '' + headless = kwargs.get('headless', False) + print(f'headless: 
{headless}') + if os.path.exists('./style.css'): with open(os.path.join('./style.css'), 'r', encoding='utf8') as file: log.info('Load CSS...') css += file.read() + '\n' - interface = gr.Blocks(css=css) + interface = gr.Blocks( + css=css, title='Kohya_ss GUI', theme=gr.themes.Default() + ) with interface: with gr.Tab('LoRA'): @@ -1387,7 +1519,7 @@ def UI(**kwargs): reg_data_dir_input, output_dir_input, logging_dir_input, - ) = lora_tab() + ) = lora_tab(headless=headless) with gr.Tab('Utilities'): utilities_tab( train_data_dir_input=train_data_dir_input, @@ -1395,21 +1527,27 @@ def UI(**kwargs): output_dir_input=output_dir_input, logging_dir_input=logging_dir_input, enable_copy_info_button=True, + headless=headless, ) # Show the interface launch_kwargs = {} - if not kwargs.get('username', None) == '': - launch_kwargs['auth'] = ( - kwargs.get('username', None), - kwargs.get('password', None), - ) - if kwargs.get('server_port', 0) > 0: - launch_kwargs['server_port'] = kwargs.get('server_port', 0) - if kwargs.get('inbrowser', False): - launch_kwargs['inbrowser'] = kwargs.get('inbrowser', False) - if kwargs.get('listen', True): - launch_kwargs['server_name'] = '0.0.0.0' + username = kwargs.get('username') + password = kwargs.get('password') + server_port = kwargs.get('server_port', 0) + inbrowser = kwargs.get('inbrowser', False) + share = kwargs.get('share', False) + server_name = kwargs.get('listen') + + launch_kwargs['server_name'] = server_name + if username and password: + launch_kwargs['auth'] = (username, password) + if server_port > 0: + launch_kwargs['server_port'] = server_port + if inbrowser: + launch_kwargs['inbrowser'] = inbrowser + if share: + launch_kwargs['share'] = share log.info(launch_kwargs) interface.launch(**launch_kwargs) @@ -1417,6 +1555,12 @@ def UI(**kwargs): if __name__ == '__main__': # torch.cuda.set_per_process_memory_fraction(0.48) parser = argparse.ArgumentParser() + parser.add_argument( + '--listen', + type=str, + default='127.0.0.1', + help='IP to listen on for connections to Gradio', + ) parser.add_argument( '--username', type=str, default='', help='Username for authentication' ) @@ -1433,9 +1577,10 @@ def UI(**kwargs): '--inbrowser', action='store_true', help='Open in browser' ) parser.add_argument( - '--listen', - action='store_true', - help='Launch gradio with server name 0.0.0.0, allowing LAN access', + '--share', action='store_true', help='Share the gradio UI' + ) + parser.add_argument( + '--headless', action='store_true', help='Is the server headless' ) args = parser.parse_args() @@ -1445,4 +1590,7 @@ def UI(**kwargs): password=args.password, inbrowser=args.inbrowser, server_port=args.server_port, + share=args.share, + listen=args.listen, + headless=args.headless, ) diff --git a/networks/lora.py b/networks/lora.py index 1a3935368..dd5f77ec7 100644 --- a/networks/lora.py +++ b/networks/lora.py @@ -68,6 +68,39 @@ def apply_to(self): self.org_module.forward = self.forward del self.org_module + def forward(self, x): + return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale + + +class LoRAInfModule(LoRAModule): + def __init__(self, lora_name, org_module: torch.nn.Module, multiplier=1.0, lora_dim=4, alpha=1): + super().__init__(lora_name, org_module, multiplier, lora_dim, alpha) + + self.org_module_ref = [org_module] # 後から参照できるように + self.enabled = True + + # check regional or not by lora_name + self.text_encoder = False + if lora_name.startswith("lora_te_"): + self.regional = False + self.use_sub_prompt = True + 
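The new `--listen`, `--share` and `--headless` flags map directly onto Gradio's `launch()` arguments. A minimal, self-contained sketch of that mapping (the placeholder UI below is not part of the repository):

```python
import argparse
import gradio as gr

parser = argparse.ArgumentParser()
parser.add_argument('--listen', type=str, default='127.0.0.1')
parser.add_argument('--username', type=str, default='')
parser.add_argument('--password', type=str, default='')
parser.add_argument('--server_port', type=int, default=0)
parser.add_argument('--inbrowser', action='store_true')
parser.add_argument('--share', action='store_true')
args = parser.parse_args()

launch_kwargs = {'server_name': args.listen}        # bind address
if args.username and args.password:
    launch_kwargs['auth'] = (args.username, args.password)
if args.server_port > 0:
    launch_kwargs['server_port'] = args.server_port
if args.inbrowser:
    launch_kwargs['inbrowser'] = True
if args.share:
    launch_kwargs['share'] = True

with gr.Blocks() as demo:
    gr.Markdown('placeholder UI')
demo.launch(**launch_kwargs)
```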
self.text_encoder = True + elif "attn2_to_k" in lora_name or "attn2_to_v" in lora_name: + self.regional = False + self.use_sub_prompt = True + elif "time_emb" in lora_name: + self.regional = False + self.use_sub_prompt = False + else: + self.regional = True + self.use_sub_prompt = False + + self.network: LoRANetwork = None + + def set_network(self, network): + self.network = network + + # freezeしてマージする def merge_to(self, sd, dtype, device): # get up/down weight up_weight = sd["lora_up.weight"].to(torch.float).to(device) @@ -99,44 +132,45 @@ def merge_to(self, sd, dtype, device): org_sd["weight"] = weight.to(dtype) self.org_module.load_state_dict(org_sd) - def set_region(self, region): - self.region = region - self.region_mask = None + # 復元できるマージのため、このモジュールのweightを返す + def get_weight(self, multiplier=None): + if multiplier is None: + multiplier = self.multiplier - def forward(self, x): - return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale + # get up/down weight from module + up_weight = self.lora_up.weight.to(torch.float) + down_weight = self.lora_down.weight.to(torch.float) - -class LoRAInfModule(LoRAModule): - def __init__(self, lora_name, org_module: torch.nn.Module, multiplier=1.0, lora_dim=4, alpha=1): - super().__init__(lora_name, org_module, multiplier, lora_dim, alpha) - - # check regional or not by lora_name - self.text_encoder = False - if lora_name.startswith("lora_te_"): - self.regional = False - self.use_sub_prompt = True - self.text_encoder = True - elif "attn2_to_k" in lora_name or "attn2_to_v" in lora_name: - self.regional = False - self.use_sub_prompt = True - elif "time_emb" in lora_name: - self.regional = False - self.use_sub_prompt = False + # pre-calculated weight + if len(down_weight.size()) == 2: + # linear + weight = self.multiplier * (up_weight @ down_weight) * self.scale + elif down_weight.size()[2:4] == (1, 1): + # conv2d 1x1 + weight = ( + self.multiplier + * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3) + * self.scale + ) else: - self.regional = True - self.use_sub_prompt = False + # conv2d 3x3 + conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3) + weight = self.multiplier * conved * self.scale - self.network: LoRANetwork = None + return weight - def set_network(self, network): - self.network = network + def set_region(self, region): + self.region = region + self.region_mask = None def default_forward(self, x): # print("default_forward", self.lora_name, x.size()) return self.org_forward(x) + self.lora_up(self.lora_down(x)) * self.multiplier * self.scale def forward(self, x): + if not self.enabled: + return self.org_forward(x) + if self.network is None or self.network.sub_prompt_index is None: return self.default_forward(x) if not self.regional and not self.use_sub_prompt: @@ -287,6 +321,35 @@ def to_out_forward(self, x): return out +def parse_block_lr_kwargs(nw_kwargs): + down_lr_weight = nw_kwargs.get("down_lr_weight", None) + mid_lr_weight = nw_kwargs.get("mid_lr_weight", None) + up_lr_weight = nw_kwargs.get("up_lr_weight", None) + + # 以上のいずれにも設定がない場合は無効としてNoneを返す + if down_lr_weight is None and mid_lr_weight is None and up_lr_weight is None: + return None, None, None + + # extract learning rate weight for each block + if down_lr_weight is not None: + # if some parameters are not set, use zero + if "," in down_lr_weight: + down_lr_weight = [(float(s) if s else 0.0) for s in down_lr_weight.split(",")] + + if mid_lr_weight is 
not None: + mid_lr_weight = float(mid_lr_weight) + + if up_lr_weight is not None: + if "," in up_lr_weight: + up_lr_weight = [(float(s) if s else 0.0) for s in up_lr_weight.split(",")] + + down_lr_weight, mid_lr_weight, up_lr_weight = get_block_lr_weight( + down_lr_weight, mid_lr_weight, up_lr_weight, float(nw_kwargs.get("block_lr_zero_threshold", 0.0)) + ) + + return down_lr_weight, mid_lr_weight, up_lr_weight + + def create_network(multiplier, network_dim, network_alpha, vae, text_encoder, unet, **kwargs): if network_dim is None: network_dim = 4 # default @@ -305,9 +368,7 @@ def create_network(multiplier, network_dim, network_alpha, vae, text_encoder, un # block dim/alpha/lr block_dims = kwargs.get("block_dims", None) - down_lr_weight = kwargs.get("down_lr_weight", None) - mid_lr_weight = kwargs.get("mid_lr_weight", None) - up_lr_weight = kwargs.get("up_lr_weight", None) + down_lr_weight, mid_lr_weight, up_lr_weight = parse_block_lr_kwargs(kwargs) # 以上のいずれかに指定があればblockごとのdim(rank)を有効にする if block_dims is not None or down_lr_weight is not None or mid_lr_weight is not None or up_lr_weight is not None: @@ -319,22 +380,6 @@ def create_network(multiplier, network_dim, network_alpha, vae, text_encoder, un block_dims, block_alphas, network_dim, network_alpha, conv_block_dims, conv_block_alphas, conv_dim, conv_alpha ) - # extract learning rate weight for each block - if down_lr_weight is not None: - # if some parameters are not set, use zero - if "," in down_lr_weight: - down_lr_weight = [(float(s) if s else 0.0) for s in down_lr_weight.split(",")] - - if mid_lr_weight is not None: - mid_lr_weight = float(mid_lr_weight) - - if up_lr_weight is not None: - if "," in up_lr_weight: - up_lr_weight = [(float(s) if s else 0.0) for s in up_lr_weight.split(",")] - - down_lr_weight, mid_lr_weight, up_lr_weight = get_block_lr_weight( - down_lr_weight, mid_lr_weight, up_lr_weight, float(kwargs.get("block_lr_zero_threshold", 0.0)) - ) # remove block dim/alpha without learning rate block_dims, block_alphas, conv_block_dims, conv_block_alphas = remove_block_dims_and_alphas( @@ -595,13 +640,19 @@ def create_network_from_weights(multiplier, file, vae, text_encoder, unet, weigh # support old LoRA without alpha for key in modules_dim.keys(): if key not in modules_alpha: - modules_alpha = modules_dim[key] + modules_alpha[key] = modules_dim[key] module_class = LoRAInfModule if for_inference else LoRAModule network = LoRANetwork( text_encoder, unet, multiplier=multiplier, modules_dim=modules_dim, modules_alpha=modules_alpha, module_class=module_class ) + + # block lr + down_lr_weight, mid_lr_weight, up_lr_weight = parse_block_lr_kwargs(kwargs) + if up_lr_weight is not None or mid_lr_weight is not None or down_lr_weight is not None: + network.set_block_lr_weight(up_lr_weight, mid_lr_weight, down_lr_weight) + return network, weights_sd @@ -770,6 +821,10 @@ def apply_to(self, text_encoder, unet, apply_text_encoder=True, apply_unet=True) lora.apply_to() self.add_module(lora.lora_name, lora) + # マージできるかどうかを返す + def is_mergeable(self): + return True + # TODO refactor to common function with apply_to def merge_to(self, text_encoder, unet, weights_sd, dtype, device): apply_text_encoder = apply_unet = False @@ -798,7 +853,7 @@ def merge_to(self, text_encoder, unet, weights_sd, dtype, device): print(f"weights are merged") - # 層別学習率用に層ごとの学習率に対する倍率を定義する + # 層別学習率用に層ごとの学習率に対する倍率を定義する 引数の順番が逆だがとりあえず気にしない def set_block_lr_weight( self, up_lr_weight: List[float] = None, @@ -956,3 +1011,40 @@ def resize_add(mh, mw): w = (w + 1) // 2 
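`get_weight()` above returns the already-scaled delta that a LoRA module contributes, which is also what `merge_to()` and the later `pre_calculation()` add onto the base weight. For the linear case this reduces to `multiplier * (lora_up @ lora_down) * (alpha / rank)`; a standalone sketch with made-up shapes:

```python
import torch

# made-up shapes; real modules take these from the checkpoint
in_dim, out_dim, rank, alpha, multiplier = 320, 320, 4, 1.0, 1.0

base = torch.randn(out_dim, in_dim)           # original nn.Linear weight
lora_down = torch.randn(rank, in_dim) * 0.01
lora_up = torch.randn(out_dim, rank) * 0.01

scale = alpha / rank
delta = multiplier * (lora_up @ lora_down) * scale   # linear branch of get_weight()
merged = base + delta                                # what merging applies in place
print(merged.shape)                                  # torch.Size([320, 320])
```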
self.mask_dic = mask_dic + + def backup_weights(self): + # 重みのバックアップを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + if not hasattr(org_module, "_lora_org_weight"): + sd = org_module.state_dict() + org_module._lora_org_weight = sd["weight"].detach().clone() + org_module._lora_restored = True + + def restore_weights(self): + # 重みのリストアを行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + if not org_module._lora_restored: + sd = org_module.state_dict() + sd["weight"] = org_module._lora_org_weight + org_module.load_state_dict(sd) + org_module._lora_restored = True + + def pre_calculation(self): + # 事前計算を行う + loras: List[LoRAInfModule] = self.text_encoder_loras + self.unet_loras + for lora in loras: + org_module = lora.org_module_ref[0] + sd = org_module.state_dict() + + org_weight = sd["weight"] + lora_weight = lora.get_weight().to(org_weight.device, dtype=org_weight.dtype) + sd["weight"] = org_weight + lora_weight + assert sd["weight"].shape == org_weight.shape + org_module.load_state_dict(sd) + + org_module._lora_restored = False + lora.enabled = False diff --git a/networks/lora_interrogator.py b/networks/lora_interrogator.py index beb251811..0dc066fd1 100644 --- a/networks/lora_interrogator.py +++ b/networks/lora_interrogator.py @@ -23,7 +23,7 @@ def interrogate(args): print(f"loading SD model: {args.sd_model}") args.pretrained_model_name_or_path = args.sd_model args.vae = None - text_encoder, vae, unet, _ = train_util.load_target_model(args,weights_dtype, DEVICE) + text_encoder, vae, unet, _ = train_util._load_target_model(args,weights_dtype, DEVICE) print(f"loading LoRA: {args.model}") network, weights_sd = lora.create_network_from_weights(1.0, args.model, vae, text_encoder, unet) diff --git a/networks/merge_lora_old.py b/networks/merge_lora_old.py index c4b6efce3..ffd6b2b40 100644 --- a/networks/merge_lora_old.py +++ b/networks/merge_lora_old.py @@ -148,13 +148,13 @@ def str_to_dtype(p): merge_to_sd_model(text_encoder, unet, args.models, args.ratios, merge_dtype) - print(f"saving SD model to: {args.save_to}") + print(f"\nsaving SD model to: {args.save_to}") model_util.save_stable_diffusion_checkpoint(args.v2, args.save_to, text_encoder, unet, args.sd_model, 0, 0, save_dtype, vae) else: state_dict, _, _ = merge_lora_models(args.models, args.ratios, merge_dtype) - print(f"saving model to: {args.save_to}") + print(f"\nsaving model to: {args.save_to}") save_to_file(args.save_to, state_dict, state_dict, save_dtype) diff --git a/requirements.txt b/requirements.txt index c407768d5..29b2fb57e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,13 +1,17 @@ -accelerate==0.15.0 +# Some comments +accelerate==0.18.0 albumentations==1.3.0 altair==4.2.2 -bitsandbytes==0.35.0 -dadaptation==1.5 +# https://github.com/bmaltais/bitsandbytes-windows-webui/raw/main/bitsandbytes-0.38.1-py3-none-any.whl; sys_platform == 'win32' +# This next line is not an error but rather there to properly catch if the url based bitsandbytes was properly installed by the line above... 
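The two `bitsandbytes` pins that follow use PEP 508 environment markers, so pip keeps only the one whose marker matches the current platform. A quick way to check how a marker evaluates locally, using the `packaging` library (the same machinery the updated `tools/validate_requirements.py` relies on):

```python
from packaging.markers import Marker

for marker in ("sys_platform == 'win32'",
               "sys_platform == 'darwin' or sys_platform == 'linux'"):
    # evaluate() fills in the current interpreter's environment by default
    print(marker, "->", Marker(marker).evaluate())
```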
+bitsandbytes==0.35.0; sys_platform == 'win32' +bitsandbytes==0.38.1; (sys_platform == "darwin" or sys_platform == "linux") +dadaptation==3.1 diffusers[torch]==0.10.2 easygui==0.98.3 einops==0.6.0 ftfy==6.1.1 -gradio==3.27.0; sys_platform != 'darwin' +gradio==3.32.0; sys_platform != 'darwin' gradio==3.23.0; sys_platform == 'darwin' lion-pytorch==0.0.6 opencv-python==4.7.0.68 @@ -31,4 +35,4 @@ tensorflow==2.10.1; sys_platform != 'darwin' # For locon support lycoris_lora==0.1.4 # for kohya_ss library -. \ No newline at end of file +. diff --git a/setup.bat b/setup.bat index 9416f835e..cffcb70d7 100644 --- a/setup.bat +++ b/setup.bat @@ -21,8 +21,7 @@ echo [2] - No (recommanded for most) set /p uninstall_choice="Enter your choice (1 or 2): " if %uninstall_choice%==1 ( - pip uninstall -y xformers - pip uninstall -y torch torchvision + pip uninstall -y xformers torch torchvision triton ) echo Please choose the version of torch you want to install: @@ -30,20 +29,30 @@ echo [1] - v1 (torch 1.12.1) (Recommended) echo [2] - v2 (torch 2.0.0) (Experimental) set /p choice="Enter your choice (1 or 2): " + +:: Only does this section to cleanup the old custom dll versions that we used to use. No longer needed now with the new bitsandbytes version +pip uninstall -y bitsandbytes +IF EXIST ".\venv\Lib\site-packages\bitsandbytes" ( + rmdir .\venv\Lib\site-packages\bitsandbytes +) +::::::::::::::::::::::::: + if %choice%==1 ( pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 --extra-index-url https://download.pytorch.org/whl/cu116 pip install --use-pep517 --upgrade -r requirements.txt pip install -U -I --no-deps https://github.com/C43H66N12O12S2/stable-diffusion-webui/releases/download/f/xformers-0.0.14.dev0-cp310-cp310-win_amd64.whl ) else ( - pip install torch==2.0.0+cu118 torchvision==0.15.1+cu118 --extra-index-url https://download.pytorch.org/whl/cu118 + pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 --extra-index-url https://download.pytorch.org/whl/cu118 pip install --use-pep517 --upgrade -r requirements.txt - pip install --upgrade xformers==0.0.17 + pip install --upgrade xformers==0.0.19 rem pip install -U -I --no-deps https://files.pythonhosted.org/packages/d6/f7/02662286419a2652c899e2b3d1913c47723fc164b4ac06a85f769c291013/xformers-0.0.17rc482-cp310-cp310-win_amd64.whl pip install https://huggingface.co/r4ziel/xformers_pre_built/resolve/main/triton-2.0.0-cp310-cp310-win_amd64.whl ) -copy /y .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ -copy /y .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py -copy /y .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py +python.exe .\tools\update_bitsandbytes.py + +@REM copy /y .\bitsandbytes_windows\*.dll .\venv\Lib\site-packages\bitsandbytes\ +@REM copy /y .\bitsandbytes_windows\cextension.py .\venv\Lib\site-packages\bitsandbytes\cextension.py +@REM copy /y .\bitsandbytes_windows\main.py .\venv\Lib\site-packages\bitsandbytes\cuda_setup\main.py accelerate config diff --git a/textual_inversion_gui.py b/textual_inversion_gui.py index 724f3f23e..9bb068d83 100644 --- a/textual_inversion_gui.py +++ b/textual_inversion_gui.py @@ -27,6 +27,7 @@ # set_legacy_8bitadam, update_my_data, check_if_model_exist, + output_message, ) from library.tensorboard_gui import ( gradio_tensorboard, @@ -38,7 +39,8 @@ ) from library.utilities import utilities_tab from library.sampler_gui import sample_gradio_config, run_cmd_sample -from easygui import msgbox + +# from easygui 
import msgbox folder_symbol = '\U0001f4c2' # 📂 refresh_symbol = '\U0001f504' # 🔄 @@ -68,6 +70,7 @@ def save_configuration( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -107,7 +110,9 @@ def save_configuration( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -118,6 +123,8 @@ def save_configuration( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -186,6 +193,7 @@ def open_configuration( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -225,7 +233,9 @@ def open_configuration( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -236,6 +246,8 @@ def open_configuration( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): # Get list of function parameters and values parameters = list(locals().items()) @@ -267,6 +279,7 @@ def open_configuration( def train_model( + headless, pretrained_model_name_or_path, v2, v_parameterization, @@ -286,6 +299,7 @@ def train_model( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -325,7 +339,9 @@ def train_model( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -336,46 +352,74 @@ def train_model( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ): + headless_bool = True if headless.get('label') == 'True' else False + if pretrained_model_name_or_path == '': - msgbox('Source model information is missing') + output_message( + msg='Source model information is missing', headless=headless_bool + ) return if train_data_dir == '': - msgbox('Image folder path is missing') + output_message( + msg='Image folder path is missing', headless=headless_bool + ) return if not os.path.exists(train_data_dir): - msgbox('Image folder does not exist') + output_message( + msg='Image folder does not exist', headless=headless_bool + ) return if reg_data_dir != '': if not os.path.exists(reg_data_dir): - msgbox('Regularisation folder does not exist') + output_message( + msg='Regularisation folder does not exist', + headless=headless_bool, + ) return if output_dir == '': - msgbox('Output folder path is missing') + output_message( + msg='Output folder path is missing', headless=headless_bool + ) return if token_string == '': - msgbox('Token string is missing') + output_message(msg='Token string is missing', headless=headless_bool) return if init_word == '': - msgbox('Init word is missing') + output_message(msg='Init word is missing', headless=headless_bool) return if not os.path.exists(output_dir): os.makedirs(output_dir) - if check_if_model_exist(output_name, output_dir, save_model_as): + if check_if_model_exist( + output_name, output_dir, save_model_as, 
headless_bool + ): return + # if float(noise_offset) > 0 and ( + # multires_noise_iterations > 0 or multires_noise_discount > 0 + # ): + # output_message( + # msg="noise offset and multires_noise can't be set at the same time. Only use one or the other.", + # title='Error', + # headless=headless_bool, + # ) + # return + if optimizer == 'Adafactor' and lr_warmup != '0': - msgbox( - "Warning: lr_scheduler is set to 'Adafactor', so 'LR warmup (% of steps)' will be considered 0.", + output_message( + msg="Warning: lr_scheduler is set to 'Adafactor', so 'LR warmup (% of steps)' will be considered 0.", title='Warning', + headless=headless_bool, ) lr_warmup = '0' @@ -431,6 +475,7 @@ def train_model( math.ceil( float(total_steps) / int(train_batch_size) + / int(gradient_accumulation_steps) * int(epoch) * int(reg_factor) ) @@ -469,7 +514,8 @@ def train_model( run_cmd += f' --reg_data_dir="{reg_data_dir}"' run_cmd += f' --resolution={max_resolution}' run_cmd += f' --output_dir="{output_dir}"' - run_cmd += f' --logging_dir="{logging_dir}"' + if not logging_dir == '': + run_cmd += f' --logging_dir="{logging_dir}"' if not stop_text_encoder_training == 0: run_cmd += ( f' --stop_text_encoder_training={stop_text_encoder_training}' @@ -507,6 +553,7 @@ def train_model( seed=seed, caption_extension=caption_extension, cache_latents=cache_latents, + cache_latents_to_disk=cache_latents_to_disk, optimizer=optimizer, optimizer_args=optimizer_args, ) @@ -533,13 +580,19 @@ def train_model( bucket_reso_steps=bucket_reso_steps, caption_dropout_every_n_epochs=caption_dropout_every_n_epochs, caption_dropout_rate=caption_dropout_rate, + noise_offset_type=noise_offset_type, noise_offset=noise_offset, + adaptive_noise_scale=adaptive_noise_scale, + multires_noise_iterations=multires_noise_iterations, + multires_noise_discount=multires_noise_discount, additional_parameters=additional_parameters, vae_batch_size=vae_batch_size, min_snr_gamma=min_snr_gamma, save_every_n_steps=save_every_n_steps, save_last_n_steps=save_last_n_steps, save_last_n_steps_state=save_last_n_steps_state, + use_wandb=use_wandb, + wandb_api_key=wandb_api_key, ) run_cmd += f' --token_string="{token_string}"' run_cmd += f' --init_word="{init_word}"' @@ -580,9 +633,11 @@ def ti_tab( reg_data_dir=gr.Textbox(), output_dir=gr.Textbox(), logging_dir=gr.Textbox(), + headless=False, ): dummy_db_true = gr.Label(value=True, visible=False) dummy_db_false = gr.Label(value=False, visible=False) + dummy_headless = gr.Label(value=headless, visible=False) gr.Markdown('Train a TI using kohya textual inversion python code...') ( button_open_config, @@ -590,7 +645,7 @@ def ti_tab( button_save_as_config, config_file_name, button_load_config, - ) = gradio_config() + ) = gradio_config(headless=headless) ( pretrained_model_name_or_path, @@ -602,7 +657,8 @@ def ti_tab( save_model_as_choices=[ 'ckpt', 'safetensors', - ] + ], + headless=headless, ) with gr.Tab('Folders'): @@ -612,7 +668,7 @@ def ti_tab( placeholder='Folder where the training folders containing the images are located', ) train_data_dir_input_folder = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) train_data_dir_input_folder.click( get_folder_path, @@ -624,7 +680,7 @@ def ti_tab( placeholder='(Optional) Folder where where the regularization folders containing the images are located', ) reg_data_dir_input_folder = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) reg_data_dir_input_folder.click( 
get_folder_path, @@ -637,7 +693,7 @@ def ti_tab( placeholder='Folder to output trained model', ) output_dir_input_folder = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) output_dir_input_folder.click( get_folder_path, @@ -649,7 +705,7 @@ def ti_tab( placeholder='Optional: enable logging and output TensorBoard log to this folder', ) logging_dir_input_folder = gr.Button( - '📂', elem_id='open_folder_small' + '📂', elem_id='open_folder_small', visible=(not headless) ) logging_dir_input_folder.click( get_folder_path, @@ -689,7 +745,9 @@ def ti_tab( label='Resume TI training', placeholder='(Optional) Path to existing TI embeding file to keep training', ) - weights_file_input = gr.Button('📂', elem_id='open_folder_small') + weights_file_input = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) weights_file_input.click( get_file_path, outputs=weights, @@ -737,6 +795,7 @@ def ti_tab( seed, caption_extension, cache_latents, + cache_latents_to_disk, optimizer, optimizer_args, ) = gradio_training( @@ -774,7 +833,9 @@ def ti_tab( label='VAE', placeholder='(Optiona) path to checkpoint of vae to replace for training', ) - vae_button = gr.Button('📂', elem_id='open_folder_small') + vae_button = gr.Button( + '📂', elem_id='open_folder_small', visible=(not headless) + ) vae_button.click( get_any_file_path, outputs=vae, @@ -802,14 +863,18 @@ def ti_tab( bucket_reso_steps, caption_dropout_every_n_epochs, caption_dropout_rate, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, additional_parameters, vae_batch_size, min_snr_gamma, save_every_n_steps, save_last_n_steps, save_last_n_steps_state, - ) = gradio_advanced_training() + use_wandb, + wandb_api_key, + ) = gradio_advanced_training(headless=headless) color_aug.change( color_aug_changed, inputs=[color_aug], @@ -832,6 +897,7 @@ def ti_tab( reg_data_dir_input=reg_data_dir, output_dir_input=output_dir, logging_dir_input=logging_dir, + headless=headless, ) button_run = gr.Button('Train model', variant='primary') @@ -870,6 +936,7 @@ def ti_tab( seed, num_cpu_threads_per_process, cache_latents, + cache_latents_to_disk, caption_extension, enable_bucket, gradient_checkpointing, @@ -909,7 +976,9 @@ def ti_tab( caption_dropout_rate, optimizer, optimizer_args, - noise_offset, + noise_offset_type,noise_offset,adaptive_noise_scale, + multires_noise_iterations, + multires_noise_discount, sample_every_n_steps, sample_every_n_epochs, sample_sampler, @@ -920,6 +989,8 @@ def ti_tab( save_every_n_steps, save_last_n_steps, save_last_n_steps_state, + use_wandb, + wandb_api_key, ] button_open_config.click( @@ -952,7 +1023,7 @@ def ti_tab( button_run.click( train_model, - inputs=settings_list, + inputs=[dummy_headless] + settings_list, show_progress=False, ) @@ -967,12 +1038,17 @@ def ti_tab( def UI(**kwargs): css = '' + headless = kwargs.get('headless', False) + print(f'headless: {headless}') + if os.path.exists('./style.css'): with open(os.path.join('./style.css'), 'r', encoding='utf8') as file: print('Load CSS...') css += file.read() + '\n' - interface = gr.Blocks(css=css) + interface = gr.Blocks( + css=css, title='Kohya_ss GUI', theme=gr.themes.Default() + ) with interface: with gr.Tab('Dreambooth TI'): @@ -981,7 +1057,7 @@ def UI(**kwargs): reg_data_dir_input, output_dir_input, logging_dir_input, - ) = ti_tab() + ) = ti_tab(headless=headless) with gr.Tab('Utilities'): utilities_tab( 
train_data_dir_input=train_data_dir_input, @@ -989,26 +1065,39 @@ def UI(**kwargs): output_dir_input=output_dir_input, logging_dir_input=logging_dir_input, enable_copy_info_button=True, + headless=headless, ) # Show the interface launch_kwargs = {} - if not kwargs.get('username', None) == '': - launch_kwargs['auth'] = ( - kwargs.get('username', None), - kwargs.get('password', None), - ) - if kwargs.get('server_port', 0) > 0: - launch_kwargs['server_port'] = kwargs.get('server_port', 0) - if kwargs.get('inbrowser', False): - launch_kwargs['inbrowser'] = kwargs.get('inbrowser', False) - print(launch_kwargs) + username = kwargs.get('username') + password = kwargs.get('password') + server_port = kwargs.get('server_port', 0) + inbrowser = kwargs.get('inbrowser', False) + share = kwargs.get('share', False) + server_name = kwargs.get('listen') + + launch_kwargs['server_name'] = server_name + if username and password: + launch_kwargs['auth'] = (username, password) + if server_port > 0: + launch_kwargs['server_port'] = server_port + if inbrowser: + launch_kwargs['inbrowser'] = inbrowser + if share: + launch_kwargs['share'] = share interface.launch(**launch_kwargs) if __name__ == '__main__': # torch.cuda.set_per_process_memory_fraction(0.48) parser = argparse.ArgumentParser() + parser.add_argument( + '--listen', + type=str, + default='127.0.0.1', + help='IP to listen on for connections to Gradio', + ) parser.add_argument( '--username', type=str, default='', help='Username for authentication' ) @@ -1024,6 +1113,12 @@ def UI(**kwargs): parser.add_argument( '--inbrowser', action='store_true', help='Open in browser' ) + parser.add_argument( + '--share', action='store_true', help='Share the gradio UI' + ) + parser.add_argument( + '--headless', action='store_true', help='Is the server headless' + ) args = parser.parse_args() @@ -1032,4 +1127,7 @@ def UI(**kwargs): password=args.password, inbrowser=args.inbrowser, server_port=args.server_port, + share=args.share, + listen=args.listen, + headless=args.headless, ) diff --git a/tools/convert_diffusers20_original_sd.py b/tools/convert_diffusers20_original_sd.py index 15a9ca4ab..b9365b519 100644 --- a/tools/convert_diffusers20_original_sd.py +++ b/tools/convert_diffusers20_original_sd.py @@ -24,9 +24,9 @@ def convert(args): is_save_ckpt = len(os.path.splitext(args.model_to_save)[1]) > 0 assert not is_load_ckpt or args.v1 != args.v2, f"v1 or v2 is required to load checkpoint / checkpointの読み込みにはv1/v2指定が必要です" - assert ( - is_save_ckpt or args.reference_model is not None - ), f"reference model is required to save as Diffusers / Diffusers形式での保存には参照モデルが必要です" + # assert ( + # is_save_ckpt or args.reference_model is not None + # ), f"reference model is required to save as Diffusers / Diffusers形式での保存には参照モデルが必要です" # モデルを読み込む msg = "checkpoint" if is_load_ckpt else ("Diffusers" + (" as fp16" if args.fp16 else "")) @@ -34,7 +34,7 @@ def convert(args): if is_load_ckpt: v2_model = args.v2 - text_encoder, vae, unet = model_util.load_models_from_stable_diffusion_checkpoint(v2_model, args.model_to_load) + text_encoder, vae, unet = model_util.load_models_from_stable_diffusion_checkpoint(v2_model, args.model_to_load, unet_use_linear_projection_in_v2=args.unet_use_linear_projection) else: pipe = StableDiffusionPipeline.from_pretrained( args.model_to_load, torch_dtype=load_dtype, tokenizer=None, safety_checker=None @@ -61,7 +61,7 @@ def convert(args): ) print(f"model saved. 
total converted state_dict keys: {key_count}") else: - print(f"copy scheduler/tokenizer config from: {args.reference_model}") + print(f"copy scheduler/tokenizer config from: {args.reference_model if args.reference_model is not None else 'default model'}") model_util.save_diffusers_checkpoint( v2_model, args.model_to_save, text_encoder, unet, args.reference_model, vae, args.use_safetensors ) @@ -76,6 +76,9 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--v2", action="store_true", help="load v2.0 model (v1 or v2 is required to load checkpoint) / 2.0のモデルを読み込む" ) + parser.add_argument( + "--unet_use_linear_projection", action="store_true", help="When saving v2 model as Diffusers, set U-Net config to `use_linear_projection=true` (to match stabilityai's model) / Diffusers形式でv2モデルを保存するときにU-Netの設定を`use_linear_projection=true`にする(stabilityaiのモデルと合わせる)" + ) parser.add_argument( "--fp16", action="store_true", @@ -100,7 +103,7 @@ def setup_parser() -> argparse.ArgumentParser: "--reference_model", type=str, default=None, - help="reference model for schduler/tokenizer, required in saving Diffusers, copy schduler/tokenizer from this / scheduler/tokenizerのコピー元のDiffusersモデル、Diffusers形式で保存するときに必要", + help="scheduler/tokenizerのコピー元Diffusersモデル、Diffusers形式で保存するときに使用される、省略時は`runwayml/stable-diffusion-v1-5` または `stabilityai/stable-diffusion-2-1` / reference Diffusers model to copy scheduler/tokenizer config from, used when saving as Diffusers format, default is `runwayml/stable-diffusion-v1-5` or `stabilityai/stable-diffusion-2-1`", ) parser.add_argument( "--use_safetensors", diff --git a/tools/crop_images_to_n_buckets.py b/tools/crop_images_to_n_buckets.py index 688b42b59..e2bdbd085 100644 --- a/tools/crop_images_to_n_buckets.py +++ b/tools/crop_images_to_n_buckets.py @@ -21,6 +21,7 @@ def sort_images_by_aspect_ratio(path): images = [] for filename in os.listdir(path): if filename.endswith(".jpg") or filename.endswith(".jpeg") or filename.endswith(".png") or filename.endswith(".webp"): + print(filename) img_path = os.path.join(path, filename) images.append((img_path, aspect_ratio(img_path))) # sort the list of tuples based on the aspect ratio @@ -135,13 +136,13 @@ def save_resized_cropped_images(group, folder_name, group_number, avg_aspect_rat for i, (img_path, aspect_ratio) in enumerate(group): image = cv2.imread(img_path) cropped_image = center_crop_image(image, avg_aspect_ratio) - resized_image = cv2.resize(cropped_image, (small_width, small_height)) + # resized_image = cv2.resize(cropped_image, (small_width, small_height)) if use_original_name: save_name = os.path.basename(img_path) else: save_name = f"group_{group_number}_{i}.jpg" save_path = os.path.join(folder_name, save_name) - cv2.imwrite(save_path, resized_image) + cv2.imwrite(save_path, cropped_image) # Copy matching files named the same as img_path to copy_related_files(img_path, save_path) diff --git a/tools/cudann_1.8_install.py b/tools/cudann_1.8_install.py index dec38a17e..2c9d1ca42 100644 --- a/tools/cudann_1.8_install.py +++ b/tools/cudann_1.8_install.py @@ -83,8 +83,6 @@ def check_versions(): # Check for "different" B&B Files and copy only if necessary if os.name == "nt": python = sys.executable - bnb_src = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..\bitsandbytes_windows") - bnb_dest = os.path.join(sysconfig.get_paths()["purelib"], "bitsandbytes") cudnn_src = os.path.join(os.path.dirname(os.path.realpath(__file__)), "..\cudnn_windows") cudnn_dest = os.path.join(sysconfig.get_paths()["purelib"], 
"torch", "lib") diff --git a/tools/group_images.py b/tools/group_images.py new file mode 100644 index 000000000..c859cdf81 --- /dev/null +++ b/tools/group_images.py @@ -0,0 +1,155 @@ +import argparse +import shutil +from PIL import Image, ImageOps +import os +import numpy as np + +class ImageProcessor: + + def __init__(self, input_folder, output_folder, group_size, include_subfolders, do_not_copy_other_files, pad): + self.input_folder = input_folder + self.output_folder = output_folder + self.group_size = group_size + self.include_subfolders = include_subfolders + self.do_not_copy_other_files = do_not_copy_other_files + self.pad = pad + self.image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.webp') + + def get_image_paths(self): + images = [] + if self.include_subfolders: + for dirpath, dirnames, filenames in os.walk(self.input_folder): + for filename in filenames: + if filename.endswith(self.image_extensions): + images.append(os.path.join(dirpath, filename)) + else: + images = [os.path.join(self.input_folder, f) for f in os.listdir(self.input_folder) if f.endswith(self.image_extensions)] + return images + + def group_images(self, images): + sorted_images = sorted(images, key=lambda path: Image.open(path).size[0] / Image.open(path).size[1]) + groups = [sorted_images[i:i+self.group_size] for i in range(0, len(sorted_images), self.group_size)] + return groups + + def process_group(self, group, group_index): + if len(group) > 0: + aspect_ratios = self.get_aspect_ratios(group) + avg_aspect_ratio = np.mean(aspect_ratios) + cropped_images = self.crop_images(group, avg_aspect_ratio) + self.resize_and_save_images(cropped_images, group_index) + if not self.do_not_copy_other_files: + self.copy_other_files(group, group_index) + + def get_aspect_ratios(self, group): + aspect_ratios = [] + for path in group: + with Image.open(path) as img: + width, height = img.size + aspect_ratios.append(width / height) + return aspect_ratios + + def crop_images(self, group, avg_aspect_ratio): + cropped_images = [] + for j, path in enumerate(group): + with Image.open(path) as img: + print(f" Processing image {j+1}: {path}") + img = self.crop_image(img, avg_aspect_ratio) + cropped_images.append(img) + return cropped_images + + def crop_image(self, img, avg_aspect_ratio): + img_aspect_ratio = img.width / img.height + if img_aspect_ratio > avg_aspect_ratio: + # Too wide, reduce width + new_width = avg_aspect_ratio * img.height + left = (img.width - new_width) / 2 + right = left + new_width + img = img.crop((left, 0, right, img.height)) + else: + # Too tall, reduce height + new_height = img.width / avg_aspect_ratio + top = (img.height - new_height) / 2 + bottom = top + new_height + img = img.crop((0, top, img.width, bottom)) + return img + + def resize_and_save_images(self, cropped_images, group_index): + max_width = max(img.width for img in cropped_images) + max_height = max(img.height for img in cropped_images) + for j, img in enumerate(cropped_images): + img = img.resize((max_width, max_height)) + os.makedirs(self.output_folder, exist_ok=True) + output_path = os.path.join(self.output_folder, f"group-{group_index+1}-image-{j+1}.jpg") + print(f" Saving processed image to {output_path}") + img.convert('RGB').save(output_path) + + def copy_other_files(self, group, group_index): + for j, path in enumerate(group): + dirpath, original_filename = os.path.split(path) + original_basename, original_ext = os.path.splitext(original_filename) + for filename in os.listdir(dirpath): + if filename.endswith('.npz'): # Skip .npz + 
continue + basename, ext = os.path.splitext(filename) + if basename == original_basename and ext != original_ext: + shutil.copy2(os.path.join(dirpath, filename), os.path.join(self.output_folder, f"group-{group_index+1}-image-{j+1}{ext}")) + + def process_images(self): + images = self.get_image_paths() + groups = self.group_images(images) + for i, group in enumerate(groups): + print(f"Processing group {i+1} with {len(group)} images...") + self.process_group(group, i) + + def process_group(self, group, group_index): + if len(group) > 0: + aspect_ratios = self.get_aspect_ratios(group) + avg_aspect_ratio = np.mean(aspect_ratios) + if self.pad: + padded_images = self.pad_images(group, avg_aspect_ratio) + self.resize_and_save_images(padded_images, group_index) + else: + cropped_images = self.crop_images(group, avg_aspect_ratio) + self.resize_and_save_images(cropped_images, group_index) + if not self.do_not_copy_other_files: + self.copy_other_files(group, group_index) + + def pad_images(self, group, avg_aspect_ratio): + padded_images = [] + for j, path in enumerate(group): + with Image.open(path) as img: + print(f" Processing image {j+1}: {path}") + img = self.pad_image(img, avg_aspect_ratio) + padded_images.append(img) + return padded_images + + def pad_image(self, img, avg_aspect_ratio): + img_aspect_ratio = img.width / img.height + if img_aspect_ratio < avg_aspect_ratio: + # Too tall, increase width + new_width = avg_aspect_ratio * img.height + pad_width = int((new_width - img.width) / 2) + img = ImageOps.expand(img, border=(pad_width, 0), fill='black') + else: + # Too wide, increase height + new_height = img.width / avg_aspect_ratio + pad_height = int((new_height - img.height) / 2) + img = ImageOps.expand(img, border=(0, pad_height), fill='black') + return img + +def main(): + parser = argparse.ArgumentParser(description='Process groups of images.') + parser.add_argument('input_folder', type=str, help='Input folder containing images') + parser.add_argument('output_folder', type=str, help='Output folder to store processed images') + parser.add_argument('group_size', type=int, help='Number of images in each group') + parser.add_argument('--include_subfolders', action='store_true', help='Include subfolders in search for images') + parser.add_argument('--do_not_copy_other_files', '--no_copy', dest='do_not_copy_other_files', action='store_true', help='Do not copy other files with the same name as images') + parser.add_argument('--pad', action='store_true', help='Pad images instead of cropping them') + + args = parser.parse_args() + + processor = ImageProcessor(args.input_folder, args.output_folder, args.group_size, args.include_subfolders, args.do_not_copy_other_files, args.pad) + processor.process_images() + +if __name__ == "__main__": + main() diff --git a/tools/group_images_recommended_size.py b/tools/group_images_recommended_size.py new file mode 100644 index 000000000..ac0d8a2f1 --- /dev/null +++ b/tools/group_images_recommended_size.py @@ -0,0 +1,128 @@ +import argparse +from PIL import Image +import os +import numpy as np +import itertools + +class ImageProcessor: + + def __init__(self, input_folder, min_group, max_group, include_subfolders, pad): + self.input_folder = input_folder + self.min_group = min_group + self.max_group = max_group + self.include_subfolders = include_subfolders + self.pad = pad + self.image_extensions = ('.png', '.jpg', '.jpeg', '.gif', '.webp') + self.losses = [] # List to store loss values for each image + + def get_image_paths(self): + images = [] + if 
self.include_subfolders: + for dirpath, dirnames, filenames in os.walk(self.input_folder): + for filename in filenames: + if filename.endswith(self.image_extensions): + images.append(os.path.join(dirpath, filename)) + else: + images = [os.path.join(self.input_folder, f) for f in os.listdir(self.input_folder) if f.endswith(self.image_extensions)] + return images + + def group_images(self, images, group_size): + sorted_images = sorted(images, key=lambda path: Image.open(path).size[0] / Image.open(path).size[1]) + groups = [sorted_images[i:i+group_size] for i in range(0, len(sorted_images), group_size)] + return groups + + def process_group(self, group): + if len(group) > 0: + aspect_ratios = self.get_aspect_ratios(group) + avg_aspect_ratio = np.mean(aspect_ratios) + self.calculate_losses(group, avg_aspect_ratio) + + def get_aspect_ratios(self, group): + aspect_ratios = [] + for path in group: + with Image.open(path) as img: + width, height = img.size + aspect_ratios.append(width / height) + return aspect_ratios + + def calculate_losses(self, group, avg_aspect_ratio): + for j, path in enumerate(group): + with Image.open(path) as img: + loss = self.calculate_loss(img, avg_aspect_ratio) + self.losses.append((path, loss)) # Add (path, loss) tuple to the list + + def calculate_loss(self, img, avg_aspect_ratio): + img_aspect_ratio = img.width / img.height + if img_aspect_ratio > avg_aspect_ratio: + # Too wide, reduce width + new_width = avg_aspect_ratio * img.height + loss = abs(img.width - new_width) / img.width # Calculate loss value + else: + # Too tall, reduce height + new_height = img.width / avg_aspect_ratio + loss = abs(img.height - new_height) / img.height # Calculate loss value + return loss + + def monte_carlo_optimization(self, groups): + best_groups = groups.copy() + best_loss = np.inf + best_removed_images = [] + + for group in groups: + num_images = len(group) + all_combinations = [] + # Generate all possible combinations of images to remove + for r in range(1, num_images + 1): + combinations = list(itertools.combinations(group, r)) + all_combinations.extend(combinations) + + for combination in all_combinations: + self.losses = [] # Reset losses for each combination + remaining_images = list(set(group) - set(combination)) + self.process_group(remaining_images) + avg_loss = np.mean(self.losses) + + if avg_loss < best_loss: + best_loss = avg_loss + best_groups[best_groups.index(group)] = remaining_images + best_removed_images = combination + + return best_groups, best_loss, best_removed_images + + def process_images(self): + images = self.get_image_paths() + num_images = len(images) + results = [] + + for group_size in range(self.min_group, self.max_group + 1): + groups = self.group_images(images, group_size) + optimized_groups, avg_loss, removed_images = self.monte_carlo_optimization(groups) + num_remaining = num_images % group_size + + results.append((group_size, avg_loss, num_remaining, optimized_groups, removed_images)) + + # Sort results based on average crop loss in ascending order + sorted_results = sorted(results, key=lambda x: x[1]) + + for group_size, avg_loss, num_remaining, optimized_groups, removed_images in sorted_results: + print(f"Group size: {group_size}, Average crop loss: {avg_loss}, Number of images remaining: {num_remaining}") + print(f"Optimized Groups: {optimized_groups}") + print(f"Removed Images: {removed_images}") + + +def main(): + parser = argparse.ArgumentParser(description='Process groups of images.') + parser.add_argument('input_folder', type=str, 
help='Input folder containing images') + parser.add_argument('min_group', type=int, help='Minimum group size') + parser.add_argument('max_group', type=int, help='Maximum group size') + parser.add_argument('--include_subfolders', action='store_true', help='Include subfolders in search for images') + parser.add_argument('--pad', action='store_true', help='Pad images instead of cropping them') + + args = parser.parse_args() + + processor = ImageProcessor(args.input_folder, args.min_group, args.max_group, args.include_subfolders, args.pad) + processor.process_images() + + +if __name__ == "__main__": + main() diff --git a/tools/original_control_net.py b/tools/original_control_net.py index 4484ce9cd..582794de7 100644 --- a/tools/original_control_net.py +++ b/tools/original_control_net.py @@ -62,7 +62,7 @@ def load_control_net(v2, unet, model): # 重みをU-Netに読み込めるようにする。ControlNetはSD版のstate dictなので、それを読み込む is_difference = "difference" in ctrl_sd_sd - print("ControlNet: loading difference") + print("ControlNet: loading difference:", is_difference) # ControlNetには存在しないキーがあるので、まず現在のU-NetでSD版の全keyを作っておく # またTransfer Controlの元weightとなる @@ -123,7 +123,8 @@ def canny(img): def preprocess_ctrl_net_hint_image(image): image = np.array(image).astype(np.float32) / 255.0 - image = image[:, :, ::-1].copy() # rgb to bgr + # ControlNetのサンプルはcv2を使っているが、読み込みはGradioなので実はRGBになっている + # image = image[:, :, ::-1].copy() # rgb to bgr image = image[None].transpose(0, 3, 1, 2) # nchw image = torch.from_numpy(image) return image # 0 to 1 diff --git a/tools/update_bitsandbytes.py b/tools/update_bitsandbytes.py new file mode 100644 index 000000000..ee8b2ae60 --- /dev/null +++ b/tools/update_bitsandbytes.py @@ -0,0 +1,49 @@ +import os +import sysconfig +import filecmp +import shutil + +def sync_bits_and_bytes_files(): + """ + Check for "different" bitsandbytes Files and copy only if necessary. + This function is specific for Windows OS. 
+ """ + + # Only execute on Windows + if os.name != "nt": + print("This function is only applicable to Windows OS.") + return + + try: + # Define source and destination directories + source_dir = os.path.join(os.getcwd(), "bitsandbytes_windows") + + dest_dir_base = os.path.join(sysconfig.get_paths()["purelib"], "bitsandbytes") + + # Clear file comparison cache + filecmp.clear_cache() + + # Iterate over each file in source directory + for file in os.listdir(source_dir): + source_file_path = os.path.join(source_dir, file) + + # Decide the destination directory based on file name + if file in ("main.py", "paths.py"): + dest_dir = os.path.join(dest_dir_base, "cuda_setup") + else: + dest_dir = dest_dir_base + + # Copy file from source to destination, maintaining original file's metadata + print(f'Copy {source_file_path} to {dest_dir}') + shutil.copy2(source_file_path, dest_dir) + + except FileNotFoundError as fnf_error: + print(f"File not found error: {fnf_error}") + except PermissionError as perm_error: + print(f"Permission error: {perm_error}") + except Exception as e: + print(f"An unexpected error occurred: {e}") + + +if __name__ == "__main__": + sync_bits_and_bytes_files() \ No newline at end of file diff --git a/tools/validate_requirements.py b/tools/validate_requirements.py index 72a241afc..73c58b032 100644 --- a/tools/validate_requirements.py +++ b/tools/validate_requirements.py @@ -57,6 +57,9 @@ def check_torch(): except Exception as e: log.error(f'Could not load torch: {e}') exit(1) +from packaging.requirements import Requirement +from packaging.markers import default_environment +import re # Parse command line arguments parser = argparse.ArgumentParser(description="Validate that requirements are satisfied.") @@ -78,27 +81,37 @@ def check_torch(): # Check each requirement against the installed packages missing_requirements = [] wrong_version_requirements = [] + +url_requirement_pattern = re.compile(r"(?Phttps?://.+);?\s?(?P.+)?") + for requirement in requirements: requirement = requirement.strip() if requirement == ".": # Skip the current requirement if it is a dot (.) 
continue + + url_match = url_requirement_pattern.match(requirement) + if url_match: + if url_match.group("marker"): + marker = url_match.group("marker") + parsed_marker = Marker(marker) + if not parsed_marker.evaluate(default_environment()): + continue + requirement = url_match.group("url") + try: - pkg_resources.require(requirement) + parsed_req = Requirement(requirement) + + # Check if the requirement has an environment marker and if it evaluates to False + if parsed_req.marker and not parsed_req.marker.evaluate(default_environment()): + continue + + pkg_resources.require(str(parsed_req)) + except ValueError: + # This block will handle URL-based requirements + pass except pkg_resources.DistributionNotFound: - # Check if the requirement contains a VCS URL - if "@" in requirement: - # If it does, split the requirement into two parts: the package name and the VCS URL - package_name, vcs_url = requirement.split("@", 1) - # Use pip to install the package from the VCS URL - os.system(f"pip install -e {vcs_url}") - # Try to require the package again - try: - pkg_resources.require(package_name) - except pkg_resources.DistributionNotFound: - missing_requirements.append(requirement) - else: - missing_requirements.append(requirement) + missing_requirements.append(requirement) except pkg_resources.VersionConflict as e: wrong_version_requirements.append((requirement, str(e.req), e.dist.version)) diff --git a/train_db.py b/train_db.py index 178d5cb4e..7ec06354b 100644 --- a/train_db.py +++ b/train_db.py @@ -23,7 +23,14 @@ BlueprintGenerator, ) import library.custom_train_functions as custom_train_functions -from library.custom_train_functions import apply_snr_weight, get_weighted_text_embeddings +from library.custom_train_functions import ( + apply_snr_weight, + get_weighted_text_embeddings, + pyramid_noise_like, + apply_noise_offset, +) + +# perlin_noise, def train(args): @@ -92,7 +99,7 @@ def train(args): weight_dtype, save_dtype = train_util.prepare_dtype(args) # モデルを読み込む - text_encoder, vae, unet, load_stable_diffusion_format = train_util.load_target_model(args, weight_dtype) + text_encoder, vae, unet, load_stable_diffusion_format = train_util.load_target_model(args, weight_dtype, accelerator) # verify load/save model formats if load_stable_diffusion_format: @@ -196,6 +203,9 @@ def train(args): else: unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler) + # transform DDP after prepare + text_encoder, unet = train_util.transform_if_model_is_DDP(text_encoder, unet) + if not train_text_encoder: text_encoder.to(accelerator.device, dtype=weight_dtype) # to avoid 'cpu' vs 'cuda' error @@ -237,7 +247,7 @@ def train(args): loss_list = [] loss_total = 0.0 for epoch in range(num_train_epochs): - print(f"epoch {epoch+1}/{num_train_epochs}") + print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 # 指定したステップ数までText Encoderを学習する:epoch最初の状態 @@ -268,8 +278,11 @@ def train(args): # Sample noise that we'll add to the latents noise = torch.randn_like(latents, device=latents.device) if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device) + noise = apply_noise_offset(latents, noise, args.noise_offset, args.adaptive_noise_scale) + elif args.multires_noise_iterations: + noise = pyramid_noise_like(noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount) + # elif 
args.perlin_noise: + # noise = perlin_noise(noise, latents.device, args.perlin_noise) # only shape of noise is used currently # Get the text embedding for conditioning with torch.set_grad_enabled(global_step < args.stop_text_encoder_training): @@ -297,7 +310,8 @@ def train(args): noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps) # Predict the noise residual - noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample + with accelerator.autocast(): + noise_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample if args.v_parameterization: # v-parameterization training @@ -361,7 +375,7 @@ def train(args): current_loss = loss.detach().item() if args.logging_dir is not None: logs = {"loss": current_loss, "lr": float(lr_scheduler.get_last_lr()[0])} - if args.optimizer_type.lower() == "DAdaptation".lower(): # tracking d*lr value + if args.optimizer_type.lower().startswith("DAdapt".lower()): # tracking d*lr value logs["lr/d*lr"] = ( lr_scheduler.optimizers[0].param_groups[0]["d"] * lr_scheduler.optimizers[0].param_groups[0]["lr"] ) diff --git a/train_network.py b/train_network.py index 5c4d5ad19..f8db030c4 100644 --- a/train_network.py +++ b/train_network.py @@ -1,4 +1,3 @@ -from torch.nn.parallel import DistributedDataParallel as DDP import importlib import argparse import gc @@ -26,7 +25,7 @@ ) import library.huggingface_util as huggingface_util import library.custom_train_functions as custom_train_functions -from library.custom_train_functions import apply_snr_weight, get_weighted_text_embeddings +from library.custom_train_functions import apply_snr_weight, get_weighted_text_embeddings, pyramid_noise_like, apply_noise_offset # TODO 他のスクリプトと共通化する @@ -44,7 +43,7 @@ def generate_step_logs(args: argparse.Namespace, current_loss, avr_loss, lr_sche logs["lr/textencoder"] = float(lrs[0]) logs["lr/unet"] = float(lrs[-1]) # may be same to textencoder - if args.optimizer_type.lower() == "DAdaptation".lower(): # tracking d*lr value of unet. + if args.optimizer_type.lower().startswith("DAdapt".lower()): # tracking d*lr value of unet. 
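The new `--multires_noise_iterations` / `--multires_noise_discount` path used above swaps plain offset noise for pyramid-style noise: lower-resolution noise is repeatedly upsampled and added to the base noise with a geometric discount. A rough, illustrative sketch of the idea, not the repository's `pyramid_noise_like` implementation:

```python
import torch
import torch.nn.functional as F

def pyramid_noise_sketch(noise: torch.Tensor, iterations: int = 6, discount: float = 0.3):
    # noise: (batch, channels, height, width) gaussian noise
    b, c, h, w = noise.shape
    for i in range(iterations):
        r = 2 ** (i + 1)                      # halve the resolution each iteration
        if h // r < 1 or w // r < 1:
            break
        low = torch.randn(b, c, h // r, w // r, device=noise.device)
        noise = noise + F.interpolate(low, size=(h, w), mode="nearest") * discount ** (i + 1)
    return noise / noise.std()                # roughly restore unit variance

# noisy = pyramid_noise_sketch(torch.randn(1, 4, 64, 64))
```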
logs["lr/d*lr"] = lr_scheduler.optimizers[-1].param_groups[0]["d"] * lr_scheduler.optimizers[-1].param_groups[0]["lr"] else: idx = 0 @@ -54,7 +53,7 @@ def generate_step_logs(args: argparse.Namespace, current_loss, avr_loss, lr_sche for i in range(idx, len(lrs)): logs[f"lr/group{i}"] = float(lrs[i]) - if args.optimizer_type.lower() == "DAdaptation".lower(): + if args.optimizer_type.lower().startswith("DAdapt".lower()): logs[f"lr/d*lr/group{i}"] = ( lr_scheduler.optimizers[-1].param_groups[i]["d"] * lr_scheduler.optimizers[-1].param_groups[i]["lr"] ) @@ -81,25 +80,25 @@ def train(args): # データセットを準備する blueprint_generator = BlueprintGenerator(ConfigSanitizer(True, True, True)) if use_user_config: - print(f"Load dataset config from {args.dataset_config}") + print(f"Loading dataset config from {args.dataset_config}") user_config = config_util.load_user_config(args.dataset_config) ignored = ["train_data_dir", "reg_data_dir", "in_json"] if any(getattr(args, attr) is not None for attr in ignored): print( - "ignore following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( + "ignoring the following options because config file is found: {0} / 設定ファイルが利用されるため以下のオプションは無視されます: {0}".format( ", ".join(ignored) ) ) else: if use_dreambooth_method: - print("Use DreamBooth method.") + print("Using DreamBooth method.") user_config = { "datasets": [ {"subsets": config_util.generate_dreambooth_subsets_config_by_subdirs(args.train_data_dir, args.reg_data_dir)} ] } else: - print("Train with captions.") + print("Training with captions.") user_config = { "datasets": [ { @@ -136,7 +135,7 @@ def train(args): ), "when caching latents, either color_aug or random_crop cannot be used / latentをキャッシュするときはcolor_augとrandom_cropは使えません" # acceleratorを準備する - print("prepare accelerator") + print("preparing accelerator") accelerator, unwrap_model = train_util.prepare_accelerator(args) is_main_process = accelerator.is_main_process @@ -144,28 +143,34 @@ def train(args): weight_dtype, save_dtype = train_util.prepare_dtype(args) # モデルを読み込む - for pi in range(accelerator.state.num_processes): - # TODO: modify other training scripts as well - if pi == accelerator.state.local_process_index: - print(f"loading model for process {accelerator.state.local_process_index}/{accelerator.state.num_processes}") + text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) - text_encoder, vae, unet, _ = train_util.load_target_model( - args, weight_dtype, accelerator.device if args.lowram else "cpu" - ) + # モデルに xformers とか memory efficient attention を組み込む + train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers) + + # 差分追加学習のためにモデルを読み込む + import sys - # work on low-ram device - if args.lowram: - text_encoder.to(accelerator.device) - unet.to(accelerator.device) - vae.to(accelerator.device) + sys.path.append(os.path.dirname(__file__)) + print("import network module:", args.network_module) + network_module = importlib.import_module(args.network_module) - gc.collect() - torch.cuda.empty_cache() - accelerator.wait_for_everyone() + if args.base_weights is not None: + # base_weights が指定されている場合は、指定された重みを読み込みマージする + for i, weight_path in enumerate(args.base_weights): + if args.base_weights_multiplier is None or len(args.base_weights_multiplier) <= i: + multiplier = 1.0 + else: + multiplier = args.base_weights_multiplier[i] - # モデルに xformers とか memory efficient attention を組み込む - train_util.replace_unet_modules(unet, args.mem_eff_attn, args.xformers) + print(f"merging module: 
{weight_path} with multiplier {multiplier}") + + module, weights_sd = network_module.create_network_from_weights( + multiplier, weight_path, vae, text_encoder, unet, for_inference=True + ) + module.merge_to(text_encoder, unet, weights_sd, weight_dtype, accelerator.device if args.lowram else "cpu") + print(f"all weights merged: {', '.join(args.base_weights)}") # 学習を準備する if cache_latents: vae.to(accelerator.device, dtype=weight_dtype) @@ -181,12 +186,6 @@ def train(args): accelerator.wait_for_everyone() # prepare network - import sys - - sys.path.append(os.path.dirname(__file__)) - print("import network module:", args.network_module) - network_module = importlib.import_module(args.network_module) - net_kwargs = {} if args.network_args is not None: for net_arg in args.network_args: @@ -194,7 +193,10 @@ def train(args): net_kwargs[key] = value # if a new network is added in future, add if ~ then blocks for each network (;'∀') - network = network_module.create_network(1.0, args.network_dim, args.network_alpha, vae, text_encoder, unet, **net_kwargs) + if args.dim_from_weights: + network, _ = network_module.create_network_from_weights(1, args.network_weights, vae, text_encoder, unet, **net_kwargs) + else: + network = network_module.create_network(1.0, args.network_dim, args.network_alpha, vae, text_encoder, unet, **net_kwargs) if network is None: return @@ -207,7 +209,7 @@ def train(args): if args.network_weights is not None: info = network.load_weights(args.network_weights) - print(f"load network weights from {args.network_weights}: {info}") + print(f"loaded network weights from {args.network_weights}: {info}") if args.gradient_checkpointing: unet.enable_gradient_checkpointing() @@ -215,7 +217,7 @@ def train(args): network.enable_gradient_checkpointing() # may have no effect # 学習に必要なクラスを準備する - print("prepare optimizer, data loader etc.") + print("preparing optimizer, data loader etc.") # 後方互換性を確保するよ try: @@ -260,7 +262,7 @@ def train(args): assert ( args.mixed_precision == "fp16" ), "full_fp16 requires mixed precision='fp16' / full_fp16を使う場合はmixed_precision='fp16'を指定してください。" - print("enable full fp16 training.") + print("enabling full fp16 training.") network.to(weight_dtype) # acceleratorがなんかよろしくやってくれるらしい @@ -279,6 +281,9 @@ def train(args): else: network, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(network, optimizer, train_dataloader, lr_scheduler) + # transform DDP after prepare (train_network here only) + text_encoder, unet, network = train_util.transform_if_model_is_DDP(text_encoder, unet, network) + unet.requires_grad_(False) unet.to(accelerator.device, dtype=weight_dtype) text_encoder.requires_grad_(False) @@ -288,20 +293,11 @@ def train(args): text_encoder.train() # set top parameter requires_grad = True for gradient checkpointing works - if type(text_encoder) == DDP: - text_encoder.module.text_model.embeddings.requires_grad_(True) - else: - text_encoder.text_model.embeddings.requires_grad_(True) + text_encoder.text_model.embeddings.requires_grad_(True) else: unet.eval() text_encoder.eval() - # support DistributedDataParallel - if type(text_encoder) == DDP: - text_encoder = text_encoder.module - unet = unet.module - network = network.module - network.prepare_grad_etc(text_encoder, unet) if not cache_latents: @@ -366,6 +362,9 @@ def train(args): "ss_seed": args.seed, "ss_lowram": args.lowram, "ss_noise_offset": args.noise_offset, + "ss_multires_noise_iterations": args.multires_noise_iterations, + "ss_multires_noise_discount": args.multires_noise_discount, + 
"ss_adaptive_noise_scale": args.adaptive_noise_scale, "ss_training_comment": args.training_comment, # will not be updated after training "ss_sd_scripts_commit_hash": train_util.get_git_revision_hash(), "ss_optimizer": optimizer_name + (f"({optimizer_args})" if len(optimizer_args) > 0 else ""), @@ -544,17 +543,18 @@ def train(args): loss_total = 0.0 del train_dataset_group - # if hasattr(network, "on_step_start"): - # on_step_start = network.on_step_start - # else: - # on_step_start = lambda *args, **kwargs: None + # callback for step start + if hasattr(network, "on_step_start"): + on_step_start = network.on_step_start + else: + on_step_start = lambda *args, **kwargs: None # function for saving/removing def save_model(ckpt_name, unwrapped_nw, steps, epoch_no, force_sync_upload=False): os.makedirs(args.output_dir, exist_ok=True) ckpt_file = os.path.join(args.output_dir, ckpt_name) - print(f"saving checkpoint: {ckpt_file}") + print(f"\nsaving checkpoint: {ckpt_file}") metadata["ss_training_finished_at"] = str(time.time()) metadata["ss_steps"] = str(steps) metadata["ss_epoch"] = str(epoch_no) @@ -572,7 +572,7 @@ def remove_model(old_ckpt_name): # training loop for epoch in range(num_train_epochs): if is_main_process: - print(f"epoch {epoch+1}/{num_train_epochs}") + print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 metadata["ss_epoch"] = str(epoch + 1) @@ -582,7 +582,7 @@ def remove_model(old_ckpt_name): for step, batch in enumerate(train_dataloader): current_step.value = global_step with accelerator.accumulate(network): - # on_step_start(text_encoder, unet) + on_step_start(text_encoder, unet) with torch.no_grad(): if "latents" in batch and batch["latents"] is not None: @@ -607,11 +607,13 @@ def remove_model(old_ckpt_name): else: input_ids = batch["input_ids"].to(accelerator.device) encoder_hidden_states = train_util.get_hidden_states(args, input_ids, tokenizer, text_encoder, weight_dtype) + # Sample noise that we'll add to the latents noise = torch.randn_like(latents, device=latents.device) if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device) + noise = apply_noise_offset(latents, noise, args.noise_offset, args.adaptive_noise_scale) + elif args.multires_noise_iterations: + noise = pyramid_noise_like(noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount) # Sample a random timestep for each image timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (b_size,), device=latents.device) @@ -733,7 +735,7 @@ def remove_model(old_ckpt_name): if is_main_process: ckpt_name = train_util.get_last_ckpt_name(args, "." 
+ args.save_model_as) save_model(ckpt_name, network, global_step, num_train_epochs, force_sync_upload=True) - + print("model saved.") @@ -780,6 +782,25 @@ def setup_parser() -> argparse.ArgumentParser: parser.add_argument( "--training_comment", type=str, default=None, help="arbitrary comment string stored in metadata / メタデータに記録する任意のコメント文字列" ) + parser.add_argument( + "--dim_from_weights", + action="store_true", + help="automatically determine dim (rank) from network_weights / dim (rank)をnetwork_weightsで指定した重みから自動で決定する", + ) + parser.add_argument( + "--base_weights", + type=str, + default=None, + nargs="*", + help="network weights to merge into the model before training / 学習前にあらかじめモデルにマージするnetworkの重みファイル", + ) + parser.add_argument( + "--base_weights_multiplier", + type=float, + default=None, + nargs="*", + help="multiplier for network weights to merge into the model before training / 学習前にあらかじめモデルにマージするnetworkの重みの倍率", + ) return parser diff --git a/train_textual_inversion.py b/train_textual_inversion.py index fb6b6053d..b73027de5 100644 --- a/train_textual_inversion.py +++ b/train_textual_inversion.py @@ -20,7 +20,7 @@ BlueprintGenerator, ) import library.custom_train_functions as custom_train_functions -from library.custom_train_functions import apply_snr_weight +from library.custom_train_functions import apply_snr_weight, pyramid_noise_like, apply_noise_offset imagenet_templates_small = [ "a photo of a {}", @@ -98,7 +98,7 @@ def train(args): weight_dtype, save_dtype = train_util.prepare_dtype(args) # モデルを読み込む - text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype) + text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) # Convert the init_word to token_id if args.init_word is not None: @@ -280,6 +280,9 @@ def train(args): text_encoder, optimizer, train_dataloader, lr_scheduler ) + # transform DDP after prepare + text_encoder, unet = train_util.transform_if_model_is_DDP(text_encoder, unet) + index_no_updates = torch.arange(len(tokenizer)) < token_ids[0] # print(len(index_no_updates), torch.sum(index_no_updates)) orig_embeds_params = unwrap_model(text_encoder).get_input_embeddings().weight.data.detach().clone() @@ -344,7 +347,7 @@ def save_model(ckpt_name, embs, steps, epoch_no, force_sync_upload=False): os.makedirs(args.output_dir, exist_ok=True) ckpt_file = os.path.join(args.output_dir, ckpt_name) - print(f"saving checkpoint: {ckpt_file}") + print(f"\nsaving checkpoint: {ckpt_file}") save_weights(ckpt_file, embs, save_dtype) if args.huggingface_repo_id is not None: huggingface_util.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload) @@ -357,7 +360,7 @@ def remove_model(old_ckpt_name): # training loop for epoch in range(num_train_epochs): - print(f"epoch {epoch+1}/{num_train_epochs}") + print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 text_encoder.train() @@ -384,8 +387,9 @@ def remove_model(old_ckpt_name): # Sample noise that we'll add to the latents noise = torch.randn_like(latents, device=latents.device) if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device) + noise = apply_noise_offset(latents, noise, args.noise_offset, args.adaptive_noise_scale) + elif args.multires_noise_iterations: + noise = pyramid_noise_like(noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount) # Sample a random timestep for each 
image timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (b_size,), device=latents.device) @@ -460,7 +464,7 @@ def remove_model(old_ckpt_name): current_loss = loss.detach().item() if args.logging_dir is not None: logs = {"loss": current_loss, "lr": float(lr_scheduler.get_last_lr()[0])} - if args.optimizer_type.lower() == "DAdaptation".lower(): # tracking d*lr value + if args.optimizer_type.lower().startswith("DAdapt".lower()): # tracking d*lr value logs["lr/d*lr"] = ( lr_scheduler.optimizers[0].param_groups[0]["d"] * lr_scheduler.optimizers[0].param_groups[0]["lr"] ) diff --git a/train_textual_inversion_XTI.py b/train_textual_inversion_XTI.py index 69ec3eb1f..8c8f7e8b5 100644 --- a/train_textual_inversion_XTI.py +++ b/train_textual_inversion_XTI.py @@ -20,7 +20,7 @@ BlueprintGenerator, ) import library.custom_train_functions as custom_train_functions -from library.custom_train_functions import apply_snr_weight +from library.custom_train_functions import apply_snr_weight, pyramid_noise_like, apply_noise_offset from XTI_hijack import unet_forward_XTI, downblock_forward_XTI, upblock_forward_XTI imagenet_templates_small = [ @@ -104,7 +104,7 @@ def train(args): weight_dtype, save_dtype = train_util.prepare_dtype(args) # モデルを読み込む - text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype) + text_encoder, vae, unet, _ = train_util.load_target_model(args, weight_dtype, accelerator) # Convert the init_word to token_id if args.init_word is not None: @@ -314,6 +314,9 @@ def train(args): text_encoder, optimizer, train_dataloader, lr_scheduler ) + # transform DDP after prepare + text_encoder, unet = train_util.transform_if_model_is_DDP(text_encoder, unet) + index_no_updates = torch.arange(len(tokenizer)) < token_ids_XTI[0] # print(len(index_no_updates), torch.sum(index_no_updates)) orig_embeds_params = unwrap_model(text_encoder).get_input_embeddings().weight.data.detach().clone() @@ -378,7 +381,7 @@ def save_model(ckpt_name, embs, steps, epoch_no, force_sync_upload=False): os.makedirs(args.output_dir, exist_ok=True) ckpt_file = os.path.join(args.output_dir, ckpt_name) - print(f"saving checkpoint: {ckpt_file}") + print(f"\nsaving checkpoint: {ckpt_file}") save_weights(ckpt_file, embs, save_dtype) if args.huggingface_repo_id is not None: huggingface_util.upload(args, ckpt_file, "/" + ckpt_name, force_sync_upload=force_sync_upload) @@ -391,7 +394,7 @@ def remove_model(old_ckpt_name): # training loop for epoch in range(num_train_epochs): - print(f"epoch {epoch+1}/{num_train_epochs}") + print(f"\nepoch {epoch+1}/{num_train_epochs}") current_epoch.value = epoch + 1 text_encoder.train() @@ -423,8 +426,9 @@ def remove_model(old_ckpt_name): # Sample noise that we'll add to the latents noise = torch.randn_like(latents, device=latents.device) if args.noise_offset: - # https://www.crosslabs.org//blog/diffusion-with-offset-noise - noise += args.noise_offset * torch.randn((latents.shape[0], latents.shape[1], 1, 1), device=latents.device) + noise = apply_noise_offset(latents, noise, args.noise_offset, args.adaptive_noise_scale) + elif args.multires_noise_iterations: + noise = pyramid_noise_like(noise, latents.device, args.multires_noise_iterations, args.multires_noise_discount) # Sample a random timestep for each image timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (b_size,), device=latents.device) @@ -499,7 +503,7 @@ def remove_model(old_ckpt_name): current_loss = loss.detach().item() if args.logging_dir is not None: logs = {"loss": current_loss, 
"lr": float(lr_scheduler.get_last_lr()[0])} - if args.optimizer_type.lower() == "DAdaptation".lower(): # tracking d*lr value + if args.optimizer_type.lower().startswith("DAdapt".lower()): # tracking d*lr value logs["lr/d*lr"] = ( lr_scheduler.optimizers[0].param_groups[0]["d"] * lr_scheduler.optimizers[0].param_groups[0]["lr"] )