Add Dockerfile and client.py; deploy to EC2 on AWS via Github Actions #19

Merged: 102 commits (Mar 15, 2024)

Commits
14685b4
Add deploy.py; .env.example; Dockerfile; client_requirements.txt; doc…
abrichr Jan 14, 2024
5e02317
add .github/workflows/docker-build.yml
abrichr Jan 14, 2024
680c358
FORCE_CUDA=1
abrichr Jan 14, 2024
b5c88d8
add .cirun.yml
abrichr Jan 14, 2024
a84bbdd
add buildspec.yml
abrichr Jan 14, 2024
71c2504
attach CloudWatchLogsFullAccess
abrichr Jan 14, 2024
ae6eae6
fixes
abrichr Jan 14, 2024
ce5ff1d
fix
abrichr Jan 14, 2024
18f9e0e
fix
abrichr Jan 14, 2024
f44f5fa
fix
abrichr Jan 14, 2024
577815f
fix
abrichr Jan 14, 2024
2c74551
check out correct branch in buildspec
abrichr Jan 15, 2024
a514e4b
echo commit id
abrichr Jan 15, 2024
486b30b
increase verbosity
abrichr Jan 15, 2024
ea93775
increase verbosity
abrichr Jan 15, 2024
330bc0f
wip
abrichr Jan 15, 2024
f85bea9
wip
abrichr Jan 15, 2024
bb08590
update README
abrichr Jan 15, 2024
c197f70
LINUX_GPU_CONTAINER
abrichr Jan 15, 2024
a68d7c4
remove .cirun.yml
abrichr Jan 15, 2024
bbe3167
wip: configure_ec2_instance
abrichr Jan 15, 2024
bb9a9c8
generate_github_actions_workflow__ec2
abrichr Jan 15, 2024
219729d
set ec2 key in github secrets
abrichr Jan 16, 2024
295a8d0
push to ecr
abrichr Jan 16, 2024
4bdb600
add missing region
abrichr Jan 16, 2024
1d9b9eb
handle pending
abrichr Jan 16, 2024
06ae173
dummy script
abrichr Jan 16, 2024
407c52b
dummy script
abrichr Jan 16, 2024
9f305fa
update workflow
abrichr Jan 16, 2024
a257217
checkout branch
abrichr Jan 16, 2024
1098b96
fixes
abrichr Jan 16, 2024
b8eee3d
command_timeout
abrichr Jan 17, 2024
46a90e5
update ami
abrichr Jan 17, 2024
a0e116d
update ami
abrichr Jan 17, 2024
6ee924e
FORCE_CUDA=1
abrichr Jan 17, 2024
e06263b
devel base image
abrichr Jan 17, 2024
7ac2f4b
fixes
abrichr Feb 2, 2024
aec2dd6
fixes
abrichr Feb 2, 2024
d4e2554
nvidia-smi; --gpu all
abrichr Feb 2, 2024
3ca5175
regenerate docker-build-ec2.yml
abrichr Feb 2, 2024
1928e88
install nvidia-container-toolkit
abrichr Feb 2, 2024
ba0ddb5
remove --gpus from build command
abrichr Feb 2, 2024
2c421e6
nvidia-docker build
abrichr Feb 2, 2024
0ea528f
nvcc instead of nvidia-smi
abrichr Feb 2, 2024
ab5c5c2
force rebuild
abrichr Feb 2, 2024
9beb2f4
git pull
abrichr Feb 2, 2024
561abfd
ubuntu20.04 -> 22.04
abrichr Feb 2, 2024
4f6ba5c
install ninja-build; comment out python install; comment out nvidia-d…
abrichr Feb 2, 2024
23b5e0d
uncomment out python install
abrichr Feb 2, 2024
aa7f396
apt-get install ninja-build
abrichr Feb 2, 2024
fe37bbd
remove ninja-build from pip
abrichr Feb 2, 2024
2daea71
cu113 -> cu123
abrichr Feb 2, 2024
fd6edcc
set TORCH_CUDA_ARCH_LIST in ops/make.sh
abrichr Feb 2, 2024
8c24673
bash make
abrichr Feb 2, 2024
7e6d538
install wget
abrichr Feb 2, 2024
0acc3e0
add -repo to ECR repo name
abrichr Feb 2, 2024
5aa0f80
fix typo
abrichr Feb 2, 2024
f9a8975
pip install gradio
abrichr Feb 2, 2024
ecbcacb
regenerate
abrichr Feb 2, 2024
9f4dc66
upgrade pip
abrichr Feb 2, 2024
f42fd3f
pin client_requirements.txt versions
abrichr Feb 2, 2024
9aff7fd
pip install detectron2
abrichr Feb 5, 2024
e8c0ceb
regenerate yml
abrichr Feb 5, 2024
fa6be54
typo
abrichr Feb 5, 2024
fc59123
move detectron2 install to make.sh
abrichr Feb 5, 2024
ff0ce67
mpi4py
abrichr Feb 5, 2024
e8c324b
new yml
abrichr Feb 5, 2024
63d3a69
install seem package then 1.0
abrichr Feb 5, 2024
5c563cd
import seem in Dockerfile
abrichr Feb 5, 2024
4b9ba61
pin seem
abrichr Feb 6, 2024
20db398
expose 8092; env GRADIO_SERVER_NAME
abrichr Feb 6, 2024
808d69e
remove ECR, cleanup
abrichr Feb 6, 2024
86d5668
stop and remove container
abrichr Feb 6, 2024
1ecac8f
OPENAI_API_KEY
abrichr Feb 6, 2024
68892b4
pip install openai
abrichr Feb 6, 2024
a2ce943
typo
abrichr Feb 6, 2024
2fcdaa0
pydantic
abrichr Feb 6, 2024
3ae5d80
add entrypoint.sh
abrichr Feb 6, 2024
679146f
handle empty secret values
abrichr Feb 6, 2024
165ecc9
improve documentation
abrichr Feb 6, 2024
2694c9f
remove unused files
abrichr Feb 6, 2024
b66d7cf
gradio==3.50.2
abrichr Feb 7, 2024
31b4b13
prevent merge conflicts
abrichr Feb 7, 2024
96c47e5
gr.ImageEditor; gradio==4.17.0; client.py
abrichr Feb 7, 2024
e8fec63
bugfix
abrichr Feb 7, 2024
cb40446
add comment to docker-build-ec2.yml
abrichr Feb 7, 2024
61906df
new yml
abrichr Feb 7, 2024
39a62ae
formatting
abrichr Feb 8, 2024
7404129
Add Deploy class; rename public methods; improve documentation; add s…
abrichr Feb 9, 2024
61c55a2
Merge pull request #7 from microsoft/main
abrichr Mar 2, 2024
33ac5e7
fix typo/grammar
abrichr Mar 2, 2024
7dd00f1
Delete .github/workflows/docker-build-ec2.yml
abrichr Mar 2, 2024
3bbd0cf
simplify .env creation instructions
abrichr Mar 2, 2024
f98af4e
update documentation examples
abrichr Mar 5, 2024
bbf0c97
update documentation
abrichr Mar 5, 2024
a050919
'add workflow file'
abrichr Mar 5, 2024
923b720
add token documentation; automate git remote and set upstream
abrichr Mar 5, 2024
59a7c82
formatting
abrichr Mar 5, 2024
828a0e3
update documentation
abrichr Mar 5, 2024
d24cd62
'add workflow file'
abrichr Mar 5, 2024
7a8560f
remove .github
abrichr Mar 5, 2024
51f78fb
Update README.md
abrichr Mar 10, 2024
9 changes: 9 additions & 0 deletions .env.example
@@ -0,0 +1,9 @@
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_REGION=
GITHUB_OWNER=
GITHUB_REPO=
GITHUB_TOKEN=
PROJECT_NAME=
# optional
OPENAI_API_KEY=
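The deployment tooling presumably reads these variables from a `.env` file copied from this template. A minimal parser sketch for illustration (a hypothetical helper — the real `deploy.py` may use `python-dotenv` or similar instead):

```python
def parse_env(text: str) -> dict:
    """Parse simple KEY=VALUE lines, skipping comments, blanks, and empty values."""
    env = {}
    for line in text.splitlines():
        line = line.strip()
        if not line or line.startswith("#"):
            continue  # skip blank lines and comments like "# optional"
        key, _, value = line.partition("=")
        if value:  # keys left empty in the template are simply omitted
            env[key.strip()] = value.strip()
    return env

print(parse_env("AWS_REGION=us-east-1\n# optional\nOPENAI_API_KEY="))
# → {'AWS_REGION': 'us-east-1'}
```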
2 changes: 2 additions & 0 deletions .gitignore
@@ -158,3 +158,5 @@ cython_debug/
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

*.sw[m-p]
43 changes: 43 additions & 0 deletions Dockerfile
@@ -0,0 +1,43 @@
FROM nvidia/cuda:12.3.1-devel-ubuntu22.04

# Install system dependencies
RUN apt-get update && \
apt-get install -y \
python3-pip python3-dev git ninja-build wget \
ffmpeg libsm6 libxext6 \
openmpi-bin libopenmpi-dev && \
ln -sf /usr/bin/python3 /usr/bin/python && \
ln -sf /usr/bin/pip3 /usr/bin/pip

# Set the working directory in the container
WORKDIR /usr/src/app

# Copy the current directory contents into the container at /usr/src/app
COPY . .

ENV FORCE_CUDA=1

# Upgrade pip
RUN python -m pip install --upgrade pip

# Install Python dependencies
RUN pip install torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/cu123 \
&& pip install git+https://github.com/UX-Decoder/Segment-Everything-Everywhere-All-At-Once.git@33f2c898fdc8d7c95dda014a4b9ebe4e413dbb2b \
&& pip install git+https://github.com/facebookresearch/segment-anything.git \
&& pip install git+https://github.com/UX-Decoder/Semantic-SAM.git@package \
&& cd ops && bash make.sh && cd .. \
&& pip install mpi4py \
&& pip install openai \
&& pip install gradio==4.17.0

# Download pretrained models
RUN sh download_ckpt.sh

# Make port 6092 available to the world outside this container
EXPOSE 6092

# Make Gradio server accessible outside 127.0.0.1
ENV GRADIO_SERVER_NAME="0.0.0.0"

RUN chmod +x /usr/src/app/entrypoint.sh
CMD ["/usr/src/app/entrypoint.sh"]
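Once built, an image like this is typically run with GPU access and the exposed port mapped to the host. A sketch of assembling such an invocation (the flag choices here are assumptions — the actual command lives in the generated GitHub Actions workflow):

```python
def docker_run_command(image: str, openai_api_key: str = "") -> list:
    """Build a `docker run` argv: detached, all GPUs attached, port 6092 published."""
    cmd = ["docker", "run", "-d", "--gpus", "all", "-p", "6092:6092"]
    if openai_api_key:  # optional, mirroring .env.example
        cmd += ["-e", "OPENAI_API_KEY=" + openai_api_key]
    cmd.append(image)
    return cmd

print(" ".join(docker_run_command("som:latest")))
# → docker run -d --gpus all -p 6092:6092 som:latest
```

The list could then be handed to `subprocess.run` on the EC2 host.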
8 changes: 8 additions & 0 deletions README.md
@@ -158,3 +158,11 @@ If you find our work helpful for your research, please consider citing the follo
journal={arXiv preprint arXiv:2310.11441},
year={2023},
}
```

## Deploy to AWS

To deploy SoM to EC2 on AWS via GitHub Actions:

1. Fork this repository and clone your fork to your local machine.
2. Follow the instructions at the top of `deploy.py`.
36 changes: 36 additions & 0 deletions client.py
@@ -0,0 +1,36 @@
"""
This module provides a command-line interface to interact with the SoM server.

The server URL is printed during deployment via `python deploy.py run`.

Usage:
python client.py "http://<server_ip>:6092"
"""

import fire
from gradio_client import Client
from loguru import logger

def predict(server_url: str):
"""
Makes a prediction using the Gradio client at the provided server URL.

Args:
server_url (str): The URL of the SoM Gradio server.
"""
client = Client(server_url)
result = client.predict(
{
"background": "https://raw.githubusercontent.com/gradio-app/gradio/main/test/test_files/bus.png",
}, # filepath in 'parameter_1' Image component
2.5, # float (numeric value between 1 and 3) in 'Granularity' Slider component
"Automatic", # Literal['Automatic', 'Interactive'] in 'Segmentation Mode' Radio component
0.5, # float (numeric value between 0 and 1) in 'Mask Alpha' Slider component
"Number", # Literal['Number', 'Alphabet'] in 'Mark Mode' Radio component
["Mark"], # List[Literal['Mask', 'Box', 'Mark']] in 'Annotation Mode' Checkboxgroup component
api_name="/inference"
)
logger.info(result)

if __name__ == "__main__":
fire.Fire(predict)
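The positional argument list `client.predict` sends to `/inference` is easy to get wrong. A hypothetical wrapper that names each field (defaults mirror `client.py`; this helper is not part of the PR):

```python
from dataclasses import dataclass, field

@dataclass
class InferenceRequest:
    """Named view of the positional arguments client.predict passes to /inference."""
    image_url: str
    granularity: float = 2.5          # 'Granularity' Slider, 1..3
    mode: str = "Automatic"           # 'Automatic' or 'Interactive'
    alpha: float = 0.5                # 'Mask Alpha' Slider, 0..1
    label_mode: str = "Number"        # 'Number' or 'Alphabet'
    anno_mode: list = field(default_factory=lambda: ["Mark"])

    def to_args(self) -> tuple:
        """Return the positional tuple in the order /inference expects."""
        return ({"background": self.image_url}, self.granularity, self.mode,
                self.alpha, self.label_mode, self.anno_mode)
```

Usage would then be `client.predict(*req.to_args(), api_name="/inference")`.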
34 changes: 12 additions & 22 deletions demo_gpt4v_som.py
@@ -83,6 +83,10 @@
def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs):
global history_images; history_images = []
global history_masks; history_masks = []

_image = image['background'].convert('RGB')
_mask = image['layers'][0].convert('L') if image['layers'] else None

if slider < 1.5:
model_name = 'seem'
elif slider > 2.5:
@@ -119,26 +123,26 @@ def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs
semantic=False

if mode == "Interactive":
labeled_array, num_features = label(np.asarray(image['mask'].convert('L')))
labeled_array, num_features = label(np.asarray(_mask))
spatial_masks = torch.stack([torch.from_numpy(labeled_array == i+1) for i in range(num_features)])

if model_name == 'semantic-sam':
model = model_semsam
output, mask = inference_semsam_m2m_auto(model, image['image'], level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)
output, mask = inference_semsam_m2m_auto(model, _image, level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)

elif model_name == 'sam':
model = model_sam
if mode == "Automatic":
output, mask = inference_sam_m2m_auto(model, image['image'], text_size, label_mode, alpha, anno_mode)
output, mask = inference_sam_m2m_auto(model, _image, text_size, label_mode, alpha, anno_mode)
elif mode == "Interactive":
output, mask = inference_sam_m2m_interactive(model, image['image'], spatial_masks, text_size, label_mode, alpha, anno_mode)
output, mask = inference_sam_m2m_interactive(model, _image, spatial_masks, text_size, label_mode, alpha, anno_mode)

elif model_name == 'seem':
model = model_seem
if mode == "Automatic":
output, mask = inference_seem_pano(model, image['image'], text_size, label_mode, alpha, anno_mode)
output, mask = inference_seem_pano(model, _image, text_size, label_mode, alpha, anno_mode)
elif mode == "Interactive":
output, mask = inference_seem_interactive(model, image['image'], spatial_masks, text_size, label_mode, alpha, anno_mode)
output, mask = inference_seem_interactive(model, _image, spatial_masks, text_size, label_mode, alpha, anno_mode)

# convert output to PIL image
history_masks.append(mask)
@@ -173,30 +177,16 @@ def highlight(mode, alpha, label_mode, anno_mode, *args, **kwargs):
sections.append((mask_i, r))
return (history_images[0], sections)

class ImageMask(gr.components.Image):
"""
Sets: source="canvas", tool="sketch"
"""

is_template = True

def __init__(self, **kwargs):
super().__init__(source="upload", tool="sketch", interactive=True, **kwargs)

def preprocess(self, x):
return super().preprocess(x)

'''
launch app
'''

demo = gr.Blocks()
image = ImageMask(label="Input", type="pil", brush_radius=20.0, brush_color="#FFFFFF", height=512)
# image = gr.Image(label="Input", type="pil", height=512)
image = gr.ImageMask(label="Input", type="pil", sources=["upload"], interactive=True, brush=gr.Brush(colors=["#FFFFFF"]))
slider = gr.Slider(1, 3, value=1.8, label="Granularity") # info="Choose in [1, 1.5), [1.5, 2.5), [2.5, 3] for [seem, semantic-sam (multi-level), sam]"
mode = gr.Radio(['Automatic', 'Interactive', ], value='Automatic', label="Segmentation Mode")
anno_mode = gr.CheckboxGroup(choices=["Mark", "Mask", "Box"], value=['Mark'], label="Annotation Mode")
image_out = gr.AnnotatedImage(label="SoM Visual Prompt",type="pil", height=512)
image_out = gr.AnnotatedImage(label="SoM Visual Prompt", height=512)
runBtn = gr.Button("Run")
highlightBtn = gr.Button("Highlight")
bot = gr.Chatbot(label="GPT-4V + SoM", height=256)
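The `_image`/`_mask` extraction introduced above reflects the Gradio 3 → 4 change in the editor's value shape: `gr.ImageMask` now passes a dict with `background`, `layers`, and `composite` keys instead of the old `image`/`mask` pair. A minimal sketch of the new unpacking (plain values stand in for PIL images, and the `.convert(...)` calls are omitted):

```python
def split_editor_value(value: dict):
    """Unpack a Gradio 4 ImageEditor/ImageMask value into (image, mask).

    Gradio 3 passed {'image': ..., 'mask': ...}; Gradio 4 passes
    {'background': ..., 'layers': [...], 'composite': ...}, where each
    sketched stroke layer appears in 'layers'.
    """
    image = value["background"]
    mask = value["layers"][0] if value.get("layers") else None
    return image, mask

img, mask = split_editor_value({"background": "base.png", "layers": [], "composite": None})
print(img, mask)  # → base.png None
```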
35 changes: 12 additions & 23 deletions demo_som.py
@@ -62,14 +62,17 @@

@torch.no_grad()
def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs):
_image = image['background'].convert('RGB')
_mask = image['layers'][0].convert('L') if image['layers'] else None

if slider < 1.5:
model_name = 'seem'
elif slider > 2.5:
model_name = 'sam'
else:
if mode == 'Automatic':
model_name = 'semantic-sam'
if slider < 1.5 + 0.14:
level = [1]
elif slider < 1.5 + 0.28:
level = [2]
@@ -98,48 +101,35 @@ def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs
semantic=False

if mode == "Interactive":
labeled_array, num_features = label(np.asarray(image['mask'].convert('L')))
labeled_array, num_features = label(np.asarray(_mask))
spatial_masks = torch.stack([torch.from_numpy(labeled_array == i+1) for i in range(num_features)])

if model_name == 'semantic-sam':
model = model_semsam
output, mask = inference_semsam_m2m_auto(model, image['image'], level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)
output, mask = inference_semsam_m2m_auto(model, _image, level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)

elif model_name == 'sam':
model = model_sam
if mode == "Automatic":
output, mask = inference_sam_m2m_auto(model, image['image'], text_size, label_mode, alpha, anno_mode)
output, mask = inference_sam_m2m_auto(model, _image, text_size, label_mode, alpha, anno_mode)
elif mode == "Interactive":
output, mask = inference_sam_m2m_interactive(model, image['image'], spatial_masks, text_size, label_mode, alpha, anno_mode)
output, mask = inference_sam_m2m_interactive(model, _image, spatial_masks, text_size, label_mode, alpha, anno_mode)

elif model_name == 'seem':
model = model_seem
if mode == "Automatic":
output, mask = inference_seem_pano(model, image['image'], text_size, label_mode, alpha, anno_mode)
output, mask = inference_seem_pano(model, _image, text_size, label_mode, alpha, anno_mode)
elif mode == "Interactive":
output, mask = inference_seem_interactive(model, image['image'], spatial_masks, text_size, label_mode, alpha, anno_mode)
output, mask = inference_seem_interactive(model, _image, spatial_masks, text_size, label_mode, alpha, anno_mode)

return output

class ImageMask(gr.components.Image):
"""
Sets: source="canvas", tool="sketch"
"""

is_template = True

def __init__(self, **kwargs):
super().__init__(source="upload", tool="sketch", interactive=True, **kwargs)

def preprocess(self, x):
return super().preprocess(x)

'''
launch app
'''

demo = gr.Blocks()
image = ImageMask(label="Input", type="pil", brush_radius=20.0, brush_color="#FFFFFF")
image = gr.ImageMask(label="Input", type="pil", sources=["upload"], interactive=True, brush=gr.Brush(colors=["#FFFFFF"]))
slider = gr.Slider(1, 3, value=2, label="Granularity", info="Choose in [1, 1.5), [1.5, 2.5), [2.5, 3] for [seem, semantic-sam (multi-level), sam]")
mode = gr.Radio(['Automatic', 'Interactive', ], value='Automatic', label="Segmentation Mode")
image_out = gr.Image(label="Auto generation",type="pil")
@@ -168,7 +158,7 @@ def preprocess(self, x):
with gr.Column():
image_out.render()
runBtn.render()
with gr.Row():
example = gr.Examples(
examples=[
["examples/ironing_man.jpg"],
@@ -189,4 +179,3 @@ def preprocess(self, x):
outputs = image_out)

demo.queue().launch(share=True,server_port=6092)
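Both demos share the same Granularity-slider dispatch. The top-level mapping can be summarized as follows (thresholds taken from the code above; the finer `semantic-sam` level buckets in 0.14-wide steps are elided here):

```python
def select_model(slider: float) -> str:
    """Map the Granularity slider (1..3) to a backend:
    [1, 1.5) -> seem, [1.5, 2.5] -> semantic-sam, (2.5, 3] -> sam."""
    if slider < 1.5:
        return "seem"
    if slider > 2.5:
        return "sam"
    return "semantic-sam"

print([select_model(s) for s in (1.0, 2.0, 3.0)])
# → ['seem', 'semantic-sam', 'sam']
```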
