Support for Marigold

semjon00 · affromero · semjon00 · commit c7e6a569db73 · 2023-12-18T19:44:04.000+02:00
Co-authored-by: Andres Romero <me@afromero.co>
diff --git a/.gitignore b/.gitignore
@@ -1,2 +1,3 @@
 __pycache__/
 venv/
+.idea/
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,6 @@
 ## Changelog
+### 0.4.5
+ * Support for [Marigold](https://marigoldmonodepth.github.io). [PR #385](https://github.com/thygate/stable-diffusion-webui-depthmap-script/pull/385).
 ### 0.4.4
  * Compatibility with stable-diffusion-webui 1.6.0
 ### 0.4.3 video processing tab
diff --git a/README.md b/README.md
@@ -1,7 +1,7 @@
 ﻿# High Resolution Depth Maps for Stable Diffusion WebUI
 This program is an addon for [AUTOMATIC1111's Stable Diffusion WebUI](https://github.com/AUTOMATIC1111/stable-diffusion-webui) that creates depth maps. Using either generated or custom depth maps, it can also create 3D stereo image pairs (side-by-side or anaglyph), normalmaps and 3D meshes. The outputs of the script can be viewed directly or used as an asset for a 3D engine. Please see [wiki](https://github.com/thygate/stable-diffusion-webui-depthmap-script/wiki/Viewing-Results) to learn more. The program has integration with [Rembg](https://github.com/danielgatis/rembg). It also supports batch processing, processing of videos, and can also be run in standalone mode, without Stable Diffusion WebUI.
 
-To generate realistic depth maps from individual images, this script uses code and models from the [MiDaS](https://github.com/isl-org/MiDaS) and [ZoeDepth](https://github.com/isl-org/ZoeDepth) repositories by Intel ISL, or LeReS from the [AdelaiDepth](https://github.com/aim-uofa/AdelaiDepth) repository by Advanced Intelligent Machines. Multi-resolution merging as implemented by [BoostingMonocularDepth](https://github.com/compphoto/BoostingMonocularDepth) is used to generate high resolution depth maps.
+To generate realistic depth maps from individual images, this script uses code and models from the [Marigold](https://github.com/prs-eth/Marigold/) repository, from the [MiDaS](https://github.com/isl-org/MiDaS) and [ZoeDepth](https://github.com/isl-org/ZoeDepth) repositories by Intel ISL, or LeReS from the [AdelaiDepth](https://github.com/aim-uofa/AdelaiDepth) repository by Advanced Intelligent Machines. Multi-resolution merging as implemented by [BoostingMonocularDepth](https://github.com/compphoto/BoostingMonocularDepth) is used to generate high resolution depth maps.
 
 Stereoscopic images are created using a custom-written algorithm.
 
@@ -198,3 +198,16 @@ ZoeDepth :
   copyright = {arXiv.org perpetual, non-exclusive license}
 }
 ```
+
+Marigold - Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation:
+
+```
+@misc{ke2023repurposing,
+      title={Repurposing Diffusion-Based Image Generators for Monocular Depth Estimation}, 
+      author={Bingxin Ke and Anton Obukhov and Shengyu Huang and Nando Metzger and Rodrigo Caye Daudt and Konrad Schindler},
+      year={2023},
+      eprint={2312.02145},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+```
diff --git a/bundled_sources.txt b/bundled_sources.txt
@@ -17,3 +17,6 @@ https://github.com/aim-uofa/AdelaiDepth/tree/main/LeReS/Minist_Test/lib/
 
 pix2pix
 https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix/
+
+Marigold
+https://github.com/prs-eth/Marigold/tree/22437a
diff --git a/install.py b/install.py
@@ -38,6 +38,8 @@ def ensure(module_name, min_version=None):
     launch.run_pip('install "moviepy==1.0.2"', "moviepy requirement for depthmap script")
 ensure('transforms3d', '0.4.1')
 
+ensure('diffusers', '0.20.1')  # For Marigold
+
 ensure('imageio')  # 2.4.1
 try:  # Dirty hack to not reinstall every time
     importlib_metadata.version('imageio-ffmpeg')
diff --git a/requirements.txt b/requirements.txt
@@ -16,5 +16,6 @@ transforms3d>=0.4.1
 imageio>=2.4.1,<3.0
 imageio-ffmpeg
 networkx>=2.5
+diffusers>=0.20.1 # For Marigold
 pyqt5; sys_platform == 'windows'
 pyqt6; sys_platform != 'windows'
diff --git a/scripts/depthmap.py b/scripts/depthmap.py
@@ -85,11 +85,16 @@ def add_option(name, default_value, description, name_prefix='depthmap_script'):
         shared.opts.add_option(f"{name_prefix}_{name}", shared.OptionInfo(default_value, description, section=section))
 
     add_option('keepmodels', False, "Do not unload depth and pix2pix models.")
+
     add_option('boost_rmax', 1600, "Maximum wholesize for boost (Rmax)")
+    add_option('marigold_ensembles', 5, "How many ensembles to use for Marigold")
+    add_option('marigold_steps', 10, "How many denoising steps to use for Marigold")
+
     add_option('save_ply', False, "Save additional PLY file with 3D inpainted mesh.")
     add_option('show_3d', True, "Enable showing 3D Meshes in output tab. (Experimental)")
     add_option('show_3d_inpaint', True, "Also show 3D Inpainted Mesh in 3D Mesh output tab. (Experimental)")
     add_option('mesh_maxsize', 2048, "Max size for generating simple mesh.")
+
     add_option('gen_heatmap_from_ui', False, "Show an option to generate HeatMap in the UI")
     add_option('extra_stereomodes', False, "Enable more possible outputs for stereoimage generation")
 
diff --git a/src/backbone.py b/src/backbone.py
@@ -4,6 +4,7 @@
 import pathlib
 from datetime import datetime
 import enum
+import sys
 
 
 class BackboneType(enum.Enum):
@@ -34,12 +35,13 @@ def get_cmd_opt(name, default):
 
     def gather_ops():
         """Parameters for depthmap generation"""
-        from modules.shared import cmd_opts
         ops = {}
-        if get_opt('depthmap_script_boost_rmax', None) is not None:
-            ops['boost_whole_size_threshold'] = get_opt('depthmap_script_boost_rmax', None)
-        ops['precision'] = cmd_opts.precision
-        ops['no_half'] = cmd_opts.no_half
+        for s in ['boost_rmax', 'precision', 'no_half', 'marigold_ensembles', 'marigold_steps']:
+            c = get_opt('depthmap_script_' + s, None)
+            if c is None:
+                c = get_cmd_opt(s, None)
+            if c is not None:
+                ops[s] = c
         return ops
 
 
@@ -117,7 +119,12 @@ def get_opt(name, default): return default  # Configuring is not supported
 
     def get_cmd_opt(name, default): return default  # Configuring is not supported
 
-    def gather_ops(): return {}  # Configuring is not supported
+    def gather_ops():  # Configuring is not supported
+        return {'boost_rmax': 1600,
+                'precision': 'autocast',
+                'no_half': False,
+                'marigold_ensembles': 5,
+                'marigold_steps': 12}
 
     def get_outpath(): return str(pathlib.Path('.', 'outputs'))
 
diff --git a/src/common_ui.py b/src/common_ui.py
@@ -37,7 +37,8 @@ def main_ui_panel(is_depth_tab):
                                                       'dpt_beit_large_384 (midas 3.1)', 'dpt_large_384 (midas 3.0)',
                                                       'dpt_hybrid_384 (midas 3.0)',
                                                       'midas_v21', 'midas_v21_small',
-                                                      'zoedepth_n (indoor)', 'zoedepth_k (outdoor)', 'zoedepth_nk'],
+                                                      'zoedepth_n (indoor)', 'zoedepth_k (outdoor)', 'zoedepth_nk',
+                                                      'Marigold v1'],
                                              type="index")
         with gr.Box() as cur_option_root:
             inp -= 'depthmap_gen_row_1', cur_option_root
diff --git a/src/depthmap_generation.py b/src/depthmap_generation.py
@@ -21,6 +21,8 @@
 from lib.multi_depth_model_woauxi import RelDepthModel
 from lib.net_tools import strip_prefix_if_present
 from pix2pix.models.pix2pix4depth_model import Pix2Pix4DepthModel
+# Marigold
+from marigold.marigold import MarigoldPipeline
 # pix2pix/merge net imports
 from pix2pix.options.test_options import TestOptions
 
@@ -42,18 +44,11 @@ def __init__(self):
         self.resize_mode = None
         self.normalization = None
 
-        # Settings (initialized to sensible values, should be updated)
-        self.boost_whole_size_threshold = 1600  # R_max from the paper by default
-        self.no_half = False
-        self.precision = "autocast"
 
-    def update_settings(self, boost_whole_size_threshold=None, no_half=None, precision=None):
-        if boost_whole_size_threshold is not None:
-            self.boost_whole_size_threshold = boost_whole_size_threshold
-        if no_half is not None:
-            self.no_half = no_half
-        if precision is not None:
-            self.precision = precision
+    def update_settings(self, **kvargs):
+        # Opens Pandora's box: every keyword argument is set as an attribute, unchecked
+        for k, v in kvargs.items():
+            setattr(self, k, v)
 
 
     def ensure_models(self, model_type, device: torch.device, boost: bool):
@@ -71,9 +66,11 @@ def load_models(self, model_type, device: torch.device, boost: bool):
         """Ensure that the depth model is loaded"""
 
         # model path and name
+        # ZoeDepth and Marigold do not use this
         model_dir = "./models/midas"
         if model_type == 0:
             model_dir = "./models/leres"
+
         # create paths to model if not present
         os.makedirs(model_dir, exist_ok=True)
         os.makedirs('./models/pix2pix', exist_ok=True)
@@ -194,12 +191,26 @@ def load_models(self, model_type, device: torch.device, boost: bool):
             conf = get_config("zoedepth_nk", "infer")
             model = build_model(conf)
 
-        model.eval()  # prepare for evaluation
+        elif model_type == 10:  # Marigold v1
+            model_path = "Bingxin/Marigold"
+            print(model_path)
+            dtype = torch.float32 if self.no_half else torch.float16
+            model = MarigoldPipeline.from_pretrained(model_path, torch_dtype=dtype)
+            try:
+                import xformers
+                model.enable_xformers_memory_efficient_attention()
+            except:
+                pass  # run without xformers
+
+        if model_type in range(0, 10):
+            model.eval()  # prepare for evaluation
         # optimize
-        if device == torch.device("cuda") and model_type in [0, 1, 2, 3, 4, 5, 6]:
-            model = model.to(memory_format=torch.channels_last)  # TODO: weird
-            if not self.no_half and model_type != 0 and not boost:  # TODO: zoedepth, too?
-                model = model.half()
+        if device == torch.device("cuda"):
+            if model_type in [0, 1, 2, 3, 4, 5, 6]:
+                model = model.to(memory_format=torch.channels_last)  # TODO: weird
+            if not self.no_half:
+                if model_type in [1, 2, 3, 4, 5, 6] and not boost:  # TODO: zoedepth, too?
+                    model = model.half()
         model.to(device)  # to correct device
 
         self.depth_model = model
@@ -238,7 +249,8 @@ def get_default_net_size(model_type):
             6: [256, 256],
             7: [384, 512],
             8: [384, 768],
-            9: [384, 512]
+            9: [384, 512],
+            10: [768, 768]
         }
         if model_type in sizes:
             return sizes[model_type]
@@ -288,14 +300,17 @@ def get_raw_prediction(self, input, net_width, net_height):
                 raw_prediction = estimateleres(img, self.depth_model, net_width, net_height)
             elif self.depth_model_type in [7, 8, 9]:
                 raw_prediction = estimatezoedepth(input, self.depth_model, net_width, net_height)
-            else:
+            elif self.depth_model_type in [1, 2, 3, 4, 5, 6]:
                 raw_prediction = estimatemidas(img, self.depth_model, net_width, net_height,
                                                self.resize_mode, self.normalization, self.no_half,
                                                self.precision == "autocast")
+            elif self.depth_model_type == 10:
+                raw_prediction = estimatemarigold(img, self.depth_model, net_width, net_height,
+                                                  self.marigold_ensembles, self.marigold_steps)
         else:
             raw_prediction = estimateboost(img, self.depth_model, self.depth_model_type, self.pix2pix_model,
-                                           self.boost_whole_size_threshold)
-        raw_prediction_invert = self.depth_model_type in [0, 7, 8, 9]
+                                           self.boost_rmax)
+        raw_prediction_invert = self.depth_model_type in [0, 7, 8, 9, 10]
         return raw_prediction, raw_prediction_invert
 
 
@@ -395,6 +410,19 @@ def estimatemidas(img, model, w, h, resize_mode, normalization, no_half, precisi
     return prediction
 
 
+# TODO: correct values for BOOST
+# TODO: "h" is not used
+def estimatemarigold(image, model, w, h, marigold_ensembles=5, marigold_steps=12):
+    # This hideous thing should be re-implemented once there is support from the upstream.
+    img = cv2.cvtColor((image * 255.0001).astype('uint8'), cv2.COLOR_BGR2RGB)
+    img = Image.fromarray(img)
+    with torch.no_grad():
+        pipe_out = model(img, processing_res=w, show_progress_bar=False,
+                         ensemble_size=marigold_ensembles, denoising_steps=marigold_steps,
+                         match_input_res=False)
+        return cv2.resize(pipe_out.depth_np, (image.shape[:2][::-1]), interpolation=cv2.INTER_CUBIC)
+
+
 class ImageandPatchs:
     def __init__(self, root_dir, name, patchsinfo, rgb_image, scale=1):
         self.root_dir = root_dir
@@ -616,7 +644,7 @@ def estimateboost(img, model, model_type, pix2pixmodel, whole_size_threshold):
     elif model_type == 1:  # dpt_beit_large_512
         net_receptive_field_size = 512
         patch_netsize = 2 * net_receptive_field_size
-    else:  # other midas
+    else:  # other midas  # TODO Marigold support
         net_receptive_field_size = 384
         patch_netsize = 2 * net_receptive_field_size
 
@@ -886,6 +914,8 @@ def doubleestimate(img, size1, size2, pix2pixsize, model, net_type, pix2pixmodel
 def singleestimate(img, msize, model, net_type):
     if net_type == 0:
         return estimateleres(img, model, msize, msize)
+    elif net_type == 10:
+        return estimatemarigold(img, model, msize, msize)
     elif net_type >= 7:
         # np to PIL
         return estimatezoedepth(Image.fromarray(np.uint8(img * 255)).convert('RGB'), model, msize, msize)
diff --git a/src/misc.py b/src/misc.py
@@ -15,7 +15,7 @@ def get_commit_hash():
 
 REPOSITORY_NAME = "stable-diffusion-webui-depthmap-script"
 SCRIPT_NAME = "DepthMap"
-SCRIPT_VERSION = "v0.4.4"
+SCRIPT_VERSION = "v0.4.5"
 SCRIPT_FULL_NAME = f"{SCRIPT_NAME} {SCRIPT_VERSION} ({get_commit_hash()})"
 
 
diff --git a/src/stereoimage_generation.py b/src/stereoimage_generation.py
@@ -75,6 +75,7 @@ def create_stereoimages(original_image, depthmap, divergence, separation=0.0, mo
 
 
 def apply_stereo_divergence(original_image, depth, divergence, separation, stereo_offset_exponent, fill_technique):
+    assert original_image.shape[:2] == depth.shape, 'Depthmap and the image must have the same size'
     depth_min = depth.min()
     depth_max = depth.max()
     normalized_depth = (depth - depth_min) / (depth_max - depth_min)

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,3 @@`
`1`	`1`	`__pycache__/`
`2`	`2`	`venv/`
	`3`	`+.idea/`