-
Notifications
You must be signed in to change notification settings - Fork 119
/
chatapi.prompt
339 lines (295 loc) · 14 KB
/
chatapi.prompt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
import math
from typing import List, Union
class ImagePatch:
    """A Python class containing a crop of an image centered around a particular object, as well as relevant information.

    The image is indexed as ``image[:, rows, columns]``: ``shape[1]`` is treated as the height and ``shape[2]`` as
    the width.

    Attributes
    ----------
    cropped_image : array_like
        An array-like of the cropped image taken from the original image.
    left, lower, right, upper : int
        An int describing the position of the (left/lower/right/upper) border of the crop's bounding box in the
        original image.
    width, height : int
        Size of the crop, read from ``cropped_image.shape[2]`` and ``cropped_image.shape[1]``.
    horizontal_center, vertical_center : float
        Midpoints of the bounding box, ``(left + right) / 2`` and ``(lower + upper) / 2``.

    Methods
    -------
    find(object_name: str)->List[ImagePatch]
        Returns a list of new ImagePatch objects containing crops of the image centered around any objects found in the
        image matching the object_name.
    exists(object_name: str)->bool
        Returns True if the object specified by object_name is found in the image, and False otherwise.
    verify_property(object_name: str, visual_property: str)->bool
        Returns True if the property is met, and False otherwise.
    best_text_match(option_list: List[str], prefix: str)->str
        Returns the string that best matches the image.
    simple_query(question: str=None)->str
        Returns the answer to a basic question asked about the image. If no question is provided, returns the answer
        to "What is this?".
    llm_query(question: str, long_answer: bool)->str
        References a large language model (e.g., GPT) to produce a response to the given question. Default is
        long-form answers, can be made short-form responses by passing long_answer=False.
    compute_depth()->float
        Returns the median depth of the image crop.
    crop(left: int, lower: int, right: int, upper: int)->ImagePatch
        Returns a new ImagePatch object containing a crop of the image at the given coordinates.
    overlaps_with(left: int, lower: int, right: int, upper: int)->bool
        Returns True if a crop with the given coordinates overlaps with this one, and False otherwise.
    """

    def __init__(self, image, left: int = None, lower: int = None, right: int = None, upper: int = None):
        """Initializes an ImagePatch object by cropping the image at the given coordinates and stores the coordinates as
        attributes. If no coordinates are provided, the image is left unmodified, and the coordinates are set to the
        dimensions of the image. If only some coordinates are provided, each missing one defaults to the
        corresponding border of the full image.

        Parameters
        ----------
        image : array_like
            An array-like of the original image.
        left, lower, right, upper : int
            An int describing the position of the (left/lower/right/upper) border of the crop's bounding box in the
            original image.
        """
        if left is None and right is None and upper is None and lower is None:
            # No crop requested: keep the very same image object rather than a slice of it.
            self.cropped_image = image
            left, lower = 0, 0
            right = image.shape[2]  # width
            upper = image.shape[1]  # height
        else:
            # Fill in any border that was not given so the stored coordinates are always integers;
            # otherwise the center computations below would fail on None.
            left = 0 if left is None else left
            lower = 0 if lower is None else lower
            right = image.shape[2] if right is None else right
            upper = image.shape[1] if upper is None else upper
            self.cropped_image = image[:, lower:upper, left:right]

        self.left = left
        self.upper = upper
        self.right = right
        self.lower = lower

        self.width = self.cropped_image.shape[2]
        self.height = self.cropped_image.shape[1]

        self.horizontal_center = (self.left + self.right) / 2
        self.vertical_center = (self.lower + self.upper) / 2

    def find(self, object_name: str) -> List["ImagePatch"]:
        """Returns a list of ImagePatch objects matching object_name contained in the crop if any are found.
        Otherwise, returns an empty list.

        Parameters
        ----------
        object_name : str
            the name of the object to be found

        Returns
        -------
        List[ImagePatch]
            a list of ImagePatch objects matching object_name contained in the crop

        Examples
        --------
        >>> # return the foo
        >>> def execute_command(image) -> List[ImagePatch]:
        >>>     image_patch = ImagePatch(image)
        >>>     foo_patches = image_patch.find("foo")
        >>>     return foo_patches
        """
        return find_in_image(self.cropped_image, object_name)

    def exists(self, object_name: str) -> bool:
        """Returns True if the object specified by object_name is found in the image, and False otherwise.

        Parameters
        ----------
        object_name : str
            A string describing the name of the object to be found in the image.

        Examples
        --------
        >>> # Are there both foos and garply bars in the photo?
        >>> def execute_command(image)->str:
        >>>     image_patch = ImagePatch(image)
        >>>     is_foo = image_patch.exists("foo")
        >>>     is_garply_bar = image_patch.exists("garply bar")
        >>>     return bool_to_yesno(is_foo and is_garply_bar)
        """
        return len(self.find(object_name)) > 0

    def verify_property(self, object_name: str, visual_property: str) -> bool:
        """Returns True if the object possesses the visual property, and False otherwise.
        Differs from 'exists' in that it presupposes the existence of the object specified by object_name, instead
        checking whether the object possesses the property.

        Parameters
        ----------
        object_name : str
            A string describing the name of the object to be found in the image.
        visual_property : str
            A string describing the simple visual property (e.g., color, shape, material) to be checked.

        Examples
        --------
        >>> # Do the letters have blue color?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     letters_patches = image_patch.find("letters")
        >>>     # Question assumes only one letter patch
        >>>     return bool_to_yesno(letters_patches[0].verify_property("letters", "blue"))
        """
        # Forward the caller's property string (not the builtin ``property`` type).
        return verify_property(self.cropped_image, object_name, visual_property)

    def best_text_match(self, option_list: List[str], prefix: str = None) -> str:
        """Returns the string that best matches the image.

        Parameters
        ----------
        option_list : List[str]
            A list with the names of the different options
        prefix : str
            A string with the prefixes to append to the options

        Examples
        --------
        >>> # Is the foo gold or white?
        >>> def execute_command(image)->str:
        >>>     image_patch = ImagePatch(image)
        >>>     foo_patches = image_patch.find("foo")
        >>>     # Question assumes one foo patch
        >>>     return foo_patches[0].best_text_match(["gold", "white"])
        """
        return best_text_match(self.cropped_image, option_list, prefix)

    def simple_query(self, question: str = None) -> str:
        """Returns the answer to a basic question asked about the image. If no question is provided, returns the answer
        to "What is this?". The questions are about basic perception, and are not meant to be used for complex reasoning
        or external knowledge.

        Parameters
        ----------
        question : str
            A string describing the question to be asked.

        Examples
        --------
        >>> # Which kind of baz is not fredding?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     baz_patches = image_patch.find("baz")
        >>>     for baz_patch in baz_patches:
        >>>         if not baz_patch.verify_property("baz", "fredding"):
        >>>             return baz_patch.simple_query("What is this baz?")

        >>> # What color is the foo?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     foo_patches = image_patch.find("foo")
        >>>     foo_patch = foo_patches[0]
        >>>     return foo_patch.simple_query("What is the color?")

        >>> # Is the second bar from the left quuxy?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     bar_patches = image_patch.find("bar")
        >>>     bar_patches.sort(key=lambda x: x.horizontal_center)
        >>>     bar_patch = bar_patches[1]
        >>>     return bar_patch.simple_query("Is the bar quuxy?")
        """
        return simple_query(self.cropped_image, question)

    def compute_depth(self) -> float:
        """Returns the median depth of the image crop

        Returns
        -------
        float
            the median depth of the image crop

        Examples
        --------
        >>> # the bar furthest away
        >>> def execute_command(image)->ImagePatch:
        >>>     image_patch = ImagePatch(image)
        >>>     bar_patches = image_patch.find("bar")
        >>>     bar_patches.sort(key=lambda bar: bar.compute_depth())
        >>>     return bar_patches[-1]
        """
        depth_map = compute_depth(self.cropped_image)
        return depth_map.median()

    def crop(self, left: int, lower: int, right: int, upper: int) -> "ImagePatch":
        """Returns a new ImagePatch cropped from the current ImagePatch.

        Parameters
        ----------
        left, lower, right, upper : int
            The (left/lower/right/upper)most pixel of the cropped image.

        Returns
        -------
        ImagePatch
            a new patch built from this patch's ``cropped_image`` and the given coordinates
        """
        return ImagePatch(self.cropped_image, left, lower, right, upper)

    def overlaps_with(self, left, lower, right, upper) -> bool:
        """Returns True if a crop with the given coordinates overlaps with this one,
        else False.

        Parameters
        ----------
        left, lower, right, upper : int
            the (left/lower/right/upper) border of the crop to be checked

        Returns
        -------
        bool
            True if a crop with the given coordinates overlaps with this one, else False

        Examples
        --------
        >>> # black foo on top of the qux
        >>> def execute_command(image) -> ImagePatch:
        >>>     image_patch = ImagePatch(image)
        >>>     qux_patches = image_patch.find("qux")
        >>>     qux_patch = qux_patches[0]
        >>>     foo_patches = image_patch.find("black foo")
        >>>     for foo in foo_patches:
        >>>         if foo.vertical_center > qux_patch.vertical_center:
        >>>             return foo
        """
        # Comparisons are inclusive, so boxes that merely share a border count as overlapping.
        return self.left <= right and self.right >= left and self.lower <= upper and self.upper >= lower

    def llm_query(self, question: str, long_answer: bool = True) -> str:
        """Answers a text question using GPT-3. The input question is always a formatted string with a variable in it.

        Parameters
        ----------
        question: str
            the text question to ask. Must not contain any reference to 'the image' or 'the photo', etc.
        long_answer: bool
            whether to return a short answer or a long answer. Short answers are one or at most two words, very concise.
            Long answers are longer, and may be paragraphs and explanations. Default is True (so long answer).

        Examples
        --------
        >>> # What is the city this building is in?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     building_patches = image_patch.find("building")
        >>>     building_patch = building_patches[0]
        >>>     building_name = building_patch.simple_query("What is the name of the building?")
        >>>     return building_patch.llm_query(f"What city is {building_name} in?", long_answer=False)

        >>> # Who invented this object?
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     object_patches = image_patch.find("object")
        >>>     object_patch = object_patches[0]
        >>>     object_name = object_patch.simple_query("What is the name of the object?")
        >>>     return object_patch.llm_query(f"Who invented {object_name}?", long_answer=False)

        >>> # Explain the history behind this object.
        >>> def execute_command(image) -> str:
        >>>     image_patch = ImagePatch(image)
        >>>     object_patches = image_patch.find("object")
        >>>     object_patch = object_patches[0]
        >>>     object_name = object_patch.simple_query("What is the name of the object?")
        >>>     return object_patch.llm_query(f"What is the history behind {object_name}?", long_answer=True)
        """
        return llm_query(question, long_answer)
def best_image_match(list_patches: List[ImagePatch], content: List[str], return_index=False) -> Union[ImagePatch, int]:
    """Returns the patch most likely to contain the content.

    NOTE(review): as written here the body only calls ``best_image_match`` again with the same arguments;
    presumably this is an interface stub whose real implementation is supplied elsewhere -- confirm before
    executing this file directly.

    Parameters
    ----------
    list_patches : List[ImagePatch]
        the candidate patches to choose from
    content : List[str]
        the object of interest
    return_index : bool
        if True, returns the index of the patch most likely to contain the object

    Returns
    -------
    ImagePatch or int
        Patch most likely to contain the object (or, per ``return_index``, its index)
    """
    return best_image_match(list_patches, content, return_index)
def distance(patch_a: "ImagePatch", patch_b: "ImagePatch") -> float:
    """
    Returns the distance between the edges of two ImagePatches. If the patches overlap, it returns a negative distance
    corresponding to the negative intersection over union.

    Only the ``left``, ``lower``, ``right`` and ``upper`` attributes of each patch are read.

    Parameters
    ----------
    patch_a : ImagePatch
    patch_b : ImagePatch

    Returns
    -------
    float
        The Euclidean gap between the two bounding boxes when they are apart; otherwise minus their
        intersection over union (0 when the boxes only touch).

    Examples
    --------
    # Return the qux that is closest to the foo
    >>> def execute_command(image):
    >>>     image_patch = ImagePatch(image)
    >>>     qux_patches = image_patch.find('qux')
    >>>     foo_patches = image_patch.find('foo')
    >>>     foo_patch = foo_patches[0]
    >>>     qux_patches.sort(key=lambda x: distance(x, foo_patch))
    >>>     return qux_patches[0]
    """
    # Gap along each axis; clamped to zero when the projections touch or overlap.
    gap_x = max(0, patch_a.left - patch_b.right, patch_b.left - patch_a.right)
    gap_y = max(0, patch_a.lower - patch_b.upper, patch_b.lower - patch_a.upper)
    edge_distance = math.hypot(gap_x, gap_y)
    if edge_distance > 0:
        return edge_distance

    # The boxes touch or overlap: report minus the intersection over union.
    inter_width = min(patch_a.right, patch_b.right) - max(patch_a.left, patch_b.left)
    inter_height = min(patch_a.upper, patch_b.upper) - max(patch_a.lower, patch_b.lower)
    intersection = max(0, inter_width) * max(0, inter_height)
    area_a = (patch_a.right - patch_a.left) * (patch_a.upper - patch_a.lower)
    area_b = (patch_b.right - patch_b.left) * (patch_b.upper - patch_b.lower)
    union = area_a + area_b - intersection
    if union <= 0:
        # Degenerate (zero-area) boxes: there is no meaningful IoU to report.
        return 0.0
    return -(intersection / union)
def bool_to_yesno(bool_answer: bool) -> str:
    """Map a truthy value to the string "yes" and a falsy value to "no"."""
    if bool_answer:
        return "yes"
    return "no"
def coerce_to_numeric(string):
    """
    This function takes a string as input and returns a float after removing any non-numeric characters.
    If the input string contains a range (e.g. "10-15"), it returns the first value in the range.

    Parameters
    ----------
    string : str
        Text containing a number, e.g. "about 3.5 kg", "$1,200" or "10-15".

    Returns
    -------
    float
        The first number left once every character other than digits, '.' and '-' has been removed.

    Raises
    ------
    ValueError
        If no digits remain after stripping the non-numeric characters.
    """
    import re  # local import: the file's top-level imports do not include ``re``

    # Keep only digits, the decimal point and the dash (sign / range separator).
    cleaned = re.sub(r"[^0-9.\-]", "", str(string))
    # The first number in what is left; for a range such as "10-15" this is the lower bound.
    match = re.search(r"-?(?:\d+(?:\.\d*)?|\.\d+)", cleaned)
    if match is None:
        raise ValueError(f"could not find a numeric value in {string!r}")
    return float(match.group())
Write a function using Python and the ImagePatch class (above) that could be executed to provide an answer to the query.
Consider the following guidelines:
- Use base Python (comparison, sorting) for basic logical operations, left/right/up/down, math, etc.
- Use the llm_query function to access external information and answer informational questions not concerning the image.
Query: INSERT_QUERY_HERE