Skip to content

Commit

Permalink
Update visual_chatgpt.py
Browse files Browse the repository at this point in the history
  • Loading branch information
BinayakJha authored Mar 14, 2023
1 parent 7eacd5d commit f4b1e2d
Showing 1 changed file with 32 additions and 40 deletions.
72 changes: 32 additions & 40 deletions visual_chatgpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ def get_new_image_name(org_img_name, func_name="update"):

class MaskFormer:
def __init__(self, device):
print("Initializing MaskFormer to %s" % device)
print(f"Initializing MaskFormer to {device}")
self.device = device
self.processor = CLIPSegProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
self.model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined").to(device)
Expand Down Expand Up @@ -217,9 +217,7 @@ def inference(self, inputs):

class Text2Image:
def __init__(self, device):

print(f"Initializing Text2Image to {device}")

self.device = device
self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
self.pipe = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5",
Expand Down Expand Up @@ -290,7 +288,7 @@ def inference(self, inputs):

class CannyText2Image:
def __init__(self, device):
print("Initializing CannyText2Image to %s" % device)
print(f"Initializing CannyText2Image to {device}")
self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-canny",
torch_dtype=self.torch_dtype)
Expand All @@ -302,7 +300,7 @@ def __init__(self, device):
self.seed = -1
self.a_prompt = 'best quality, extremely detailed'
self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
'fewer digits, cropped, worst quality, low quality'
'fewer digits, cropped, worst quality, low quality'

@prompts(name="Generate Image Condition On Canny Image",
description="useful when you want to generate a new real image from both the user desciption and a canny image."
Expand All @@ -315,7 +313,7 @@ def inference(self, inputs):
image = Image.open(image_path)
self.seed = random.randint(0, 65535)
seed_everything(self.seed)
prompt = instruct_text + ', ' + self.a_prompt
prompt = f'{instruct_text}, {self.a_prompt}'
image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
guidance_scale=9.0).images[0]
updated_image_path = get_new_image_name(image_path, func_name="canny2image")
Expand Down Expand Up @@ -346,7 +344,7 @@ def inference(self, inputs):

class LineText2Image:
def __init__(self, device):
print("Initializing LineText2Image to %s" % device)
print(f"Initializing LineText2Image to {device}")
self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-mlsd",
torch_dtype=self.torch_dtype)
Expand All @@ -359,7 +357,7 @@ def __init__(self, device):
self.seed = -1
self.a_prompt = 'best quality, extremely detailed'
self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
'fewer digits, cropped, worst quality, low quality'
'fewer digits, cropped, worst quality, low quality'

@prompts(name="Generate Image Condition On Line Image",
description="useful when you want to generate a new real image from both the user desciption "
Expand All @@ -369,11 +367,11 @@ def __init__(self, device):
"The input to this tool should be a comma seperated string of two, "
"representing the image_path and the user description. ")
def inference(self, inputs):
image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
image_path, instruct_text = inputs.split(",")[0], ','.join(inputs.split(',')[1:])
image = Image.open(image_path)
self.seed = random.randint(0, 65535)
seed_everything(self.seed)
prompt = instruct_text + ', ' + self.a_prompt
prompt = f'{instruct_text}, {self.a_prompt}'
image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
guidance_scale=9.0).images[0]
updated_image_path = get_new_image_name(image_path, func_name="line2image")
Expand Down Expand Up @@ -404,7 +402,7 @@ def inference(self, inputs):

class HedText2Image:
def __init__(self, device):
print("Initializing HedText2Image to %s" % device)
print(f"Initializing HedText2Image to {device}")
self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-hed",
torch_dtype=self.torch_dtype)
Expand All @@ -417,7 +415,7 @@ def __init__(self, device):
self.seed = -1
self.a_prompt = 'best quality, extremely detailed'
self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
'fewer digits, cropped, worst quality, low quality'
'fewer digits, cropped, worst quality, low quality'

@prompts(name="Generate Image Condition On Soft Hed Boundary Image",
description="useful when you want to generate a new real image from both the user desciption "
Expand All @@ -431,7 +429,7 @@ def inference(self, inputs):
image = Image.open(image_path)
self.seed = random.randint(0, 65535)
seed_everything(self.seed)
prompt = instruct_text + ', ' + self.a_prompt
prompt = f'{instruct_text}, {self.a_prompt}'
image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
guidance_scale=9.0).images[0]
updated_image_path = get_new_image_name(image_path, func_name="hed2image")
Expand All @@ -441,7 +439,6 @@ def inference(self, inputs):
return updated_image_path



class Image2Scribble:
def __init__(self, device):
print("Initializing Image2Scribble")
Expand All @@ -463,7 +460,7 @@ def inference(self, inputs):

class ScribbleText2Image:
def __init__(self, device):
print("Initializing ScribbleText2Image to %s" % device)
print(f"Initializing ScribbleText2Image to {device}")
self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-scribble",
torch_dtype=self.torch_dtype)
Expand All @@ -476,7 +473,7 @@ def __init__(self, device):
self.seed = -1
self.a_prompt = 'best quality, extremely detailed'
self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, ' \
'fewer digits, cropped, worst quality, low quality'
'fewer digits, cropped, worst quality, low quality'

@prompts(name="Generate Image Condition On Sketch Image",
description="useful when you want to generate a new real image from both the user desciption and "
Expand All @@ -488,14 +485,14 @@ def inference(self, inputs):
image = Image.open(image_path)
self.seed = random.randint(0, 65535)
seed_everything(self.seed)
prompt = instruct_text + ', ' + self.a_prompt
prompt = f'{instruct_text}, {self.a_prompt}'
image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
guidance_scale=9.0).images[0]
updated_image_path = get_new_image_name(image_path, func_name="scribble2image")
image.save(updated_image_path)
print(f"\nProcessed ScribbleText2Image, Input Scribble: {image_path}, Input Text: {instruct_text}, "
f"Output Image: {updated_image_path}")
return
return updated_image_path


class Image2Pose:
Expand All @@ -518,7 +515,7 @@ def inference(self, inputs):

class PoseText2Image:
def __init__(self, device):
print("Initializing PoseText2Image to %s" % device)
print(f"Initializing PoseText2Image to {device}")
self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-openpose",
torch_dtype=self.torch_dtype)
Expand All @@ -532,7 +529,7 @@ def __init__(self, device):
self.unconditional_guidance_scale = 9.0
self.a_prompt = 'best quality, extremely detailed'
self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
' fewer digits, cropped, worst quality, low quality'
' fewer digits, cropped, worst quality, low quality'

@prompts(name="Generate Image Condition On Pose Image",
description="useful when you want to generate a new real image from both the user desciption "
Expand All @@ -546,7 +543,7 @@ def inference(self, inputs):
image = Image.open(image_path)
self.seed = random.randint(0, 65535)
seed_everything(self.seed)
prompt = instruct_text + ', ' + self.a_prompt
prompt = f'{instruct_text}, {self.a_prompt}'
image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
guidance_scale=9.0).images[0]
updated_image_path = get_new_image_name(image_path, func_name="pose2image")
Expand Down Expand Up @@ -625,7 +622,7 @@ def inference(self, inputs):

class SegText2Image:
def __init__(self, device):
print("Initializing SegText2Image to %s" % device)
print(f"Initializing SegText2Image to {device}")
self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
self.controlnet = ControlNetModel.from_pretrained("fusing/stable-diffusion-v1-5-controlnet-seg",
torch_dtype=self.torch_dtype)
Expand All @@ -637,7 +634,7 @@ def __init__(self, device):
self.seed = -1
self.a_prompt = 'best quality, extremely detailed'
self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
' fewer digits, cropped, worst quality, low quality'
' fewer digits, cropped, worst quality, low quality'

@prompts(name="Generate Image Condition On Segmentations",
description="useful when you want to generate a new real image from both the user desciption and segmentations. "
Expand All @@ -650,7 +647,7 @@ def inference(self, inputs):
image = Image.open(image_path)
self.seed = random.randint(0, 65535)
seed_everything(self.seed)
prompt = instruct_text + ', ' + self.a_prompt
prompt = f'{instruct_text}, {self.a_prompt}'
image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
guidance_scale=9.0).images[0]
updated_image_path = get_new_image_name(image_path, func_name="segment2image")
Expand Down Expand Up @@ -684,7 +681,7 @@ def inference(self, inputs):

class DepthText2Image:
def __init__(self, device):
print("Initializing DepthText2Image to %s" % device)
print(f"Initializing DepthText2Image to {device}")
self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
self.controlnet = ControlNetModel.from_pretrained(
"fusing/stable-diffusion-v1-5-controlnet-depth", torch_dtype=self.torch_dtype)
Expand All @@ -696,7 +693,7 @@ def __init__(self, device):
self.seed = -1
self.a_prompt = 'best quality, extremely detailed'
self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
' fewer digits, cropped, worst quality, low quality'
' fewer digits, cropped, worst quality, low quality'

@prompts(name="Generate Image Condition On Depth",
description="useful when you want to generate a new real image from both the user desciption and depth image. "
Expand All @@ -709,7 +706,7 @@ def inference(self, inputs):
image = Image.open(image_path)
self.seed = random.randint(0, 65535)
seed_everything(self.seed)
prompt = instruct_text + ', ' + self.a_prompt
prompt = f'{instruct_text}, {self.a_prompt}'
image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
guidance_scale=9.0).images[0]
updated_image_path = get_new_image_name(image_path, func_name="depth2image")
Expand All @@ -719,7 +716,6 @@ def inference(self, inputs):
return updated_image_path



class Image2Normal:
def __init__(self, device):
print("Initializing Image2Normal")
Expand Down Expand Up @@ -756,7 +752,7 @@ def inference(self, inputs):

class NormalText2Image:
def __init__(self, device):
print("Initializing NormalText2Image to %s" % device)
print(f"Initializing NormalText2Image to {device}")
self.torch_dtype = torch.float16 if 'cuda' in device else torch.float32
self.controlnet = ControlNetModel.from_pretrained(
"fusing/stable-diffusion-v1-5-controlnet-normal", torch_dtype=self.torch_dtype)
Expand All @@ -768,7 +764,7 @@ def __init__(self, device):
self.seed = -1
self.a_prompt = 'best quality, extremely detailed'
self.n_prompt = 'longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit,' \
' fewer digits, cropped, worst quality, low quality'
' fewer digits, cropped, worst quality, low quality'

@prompts(name="Generate Image Condition On Normal Map",
description="useful when you want to generate a new real image from both the user desciption and normal map. "
Expand All @@ -781,7 +777,7 @@ def inference(self, inputs):
image = Image.open(image_path)
self.seed = random.randint(0, 65535)
seed_everything(self.seed)
prompt = instruct_text + ', ' + self.a_prompt
prompt = f'{instruct_text}, {self.a_prompt}'
image = self.pipe(prompt, image, num_inference_steps=20, eta=0.0, negative_prompt=self.n_prompt,
guidance_scale=9.0).images[0]
updated_image_path = get_new_image_name(image_path, func_name="normal2image")
Expand Down Expand Up @@ -825,12 +821,12 @@ def __init__(self, load_dict):
self.llm = OpenAI(temperature=0)
self.memory = ConversationBufferMemory(memory_key="chat_history", output_key='output')

self.models = dict()
self.models = {}
for class_name, device in load_dict.items():
self.models[class_name] = globals()[class_name](device=device)

self.tools = []
for class_name, instance in self.models.items():
for instance in self.models.values():
for e in dir(instance):
if e.startswith('inference'):
func = getattr(instance, e)
Expand All @@ -857,7 +853,7 @@ def run_text(self, text, state):
return state, state

def run_image(self, image, state, txt):
image_filename = os.path.join('image', str(uuid.uuid4())[:8] + ".png")
image_filename = os.path.join('image', f"{str(uuid.uuid4())[:8]}.png")
print("======>Auto Resize Image...")
img = Image.open(image.name)
width, height = img.size
Expand All @@ -870,17 +866,13 @@ def run_image(self, image, state, txt):
img.save(image_filename, "PNG")
print(f"Resize image form {width}x{height} to {width_new}x{height_new}")
description = self.models['ImageCaptioning'].inference(image_filename)
Human_prompt = "\nHuman: provide a figure named {}. The description is: {}. " \
"This information helps you to understand this image, " \
"but you should use tools to finish following tasks, " \
"rather than directly imagine from my description. If you understand, say \"Received\". \n".format(
image_filename, description)
Human_prompt = f'\nHuman: provide a figure named {image_filename}. The description is: {description}. This information helps you to understand this image, but you should use tools to finish following tasks, rather than directly imagine from my description. If you understand, say \"Received\". \n'
AI_prompt = "Received. "
self.agent.memory.buffer = self.agent.memory.buffer + Human_prompt + 'AI: ' + AI_prompt
state = state + [(f"![](/file={image_filename})*{image_filename}*", AI_prompt)]
print(f"\nProcessed run_image, Input image: {image_filename}\nCurrent state: {state}\n"
f"Current Memory: {self.agent.memory.buffer}")
return state, state, txt + ' ' + image_filename + ' '
return state, state, f'{txt} {image_filename} '


if __name__ == '__main__':
Expand Down

0 comments on commit f4b1e2d

Please sign in to comment.