2
2
from oumi .core .registry import register_dataset
3
3
from oumi .core .types .turn import Conversation , Message , Role , Type
4
4
5
+ _COCO_COLUMN_SENTENCES = "sentences"
6
+ _COCO_COLUMN_RAW = "raw"
7
+ _COCO_COLUMN_IMAGE = "image"
8
+ _COCO_COLUMN_PATH = "path"
9
+ _COCO_COLUMN_BYTES = "bytes"
10
+
5
11
6
12
@register_dataset ("coco_captions" )
7
13
class COCOCaptionsDataset (VisionLanguageSftDataset ):
@@ -12,43 +18,43 @@ def transform_conversation(self, example: dict) -> Conversation:
12
18
"""Transform a single conversation example into a Conversation object."""
13
19
input_text = self .default_prompt
14
20
15
- for required_key in ("sentences" , "image" ):
21
+ for required_key in (_COCO_COLUMN_SENTENCES , _COCO_COLUMN_IMAGE ):
16
22
if required_key not in example :
17
23
raise ValueError (
18
24
f"Training example doesn't contain '{required_key}' key. "
19
25
f"Available keys: { example .keys ()} ."
20
26
)
21
27
22
- if "raw" not in example ["sentences" ]:
28
+ if _COCO_COLUMN_RAW not in example [_COCO_COLUMN_SENTENCES ]:
23
29
raise ValueError (
24
- "Training example doesn't contain 'sentences.raw' key. "
25
- f"Available keys under 'sentences.': { example ['sentences' ].keys ()} ."
30
+ "Training example doesn't contain 'sentences.raw' key. Available keys "
31
+ f"under 'sentences.': { example [_COCO_COLUMN_SENTENCES ].keys ()} ."
26
32
)
27
- output_text = example ["sentences" ][ "raw" ]
33
+ output_text = example [_COCO_COLUMN_SENTENCES ][ _COCO_COLUMN_RAW ]
28
34
29
35
messages = [Message (role = Role .USER , content = input_text )]
30
36
31
- if "bytes" in example ["image" ]:
37
+ if _COCO_COLUMN_BYTES in example [_COCO_COLUMN_IMAGE ]:
32
38
messages .append (
33
39
Message (
34
40
role = Role .USER ,
35
- binary = example ["image" ][ "bytes" ],
41
+ binary = example [_COCO_COLUMN_IMAGE ][ _COCO_COLUMN_BYTES ],
36
42
type = Type .IMAGE_BINARY ,
37
43
)
38
44
)
39
- elif "path" in example ["image" ]:
45
+ elif _COCO_COLUMN_PATH in example [_COCO_COLUMN_IMAGE ]:
40
46
messages .append (
41
47
Message (
42
48
role = Role .USER ,
43
- content = example ["image" ][ "path" ],
49
+ content = example [_COCO_COLUMN_IMAGE ][ _COCO_COLUMN_PATH ],
44
50
type = Type .IMAGE_PATH ,
45
51
)
46
52
)
47
53
else :
48
54
raise ValueError (
49
55
"Training example contains none of required keys: "
50
56
"'image.bytes', 'image.path'. "
51
- f"Available keys under 'image.': { example ['image' ].keys ()} ."
57
+ f"Available keys under 'image.': { example [_COCO_COLUMN_IMAGE ].keys ()} ."
52
58
)
53
59
54
60
messages .append (Message (role = Role .ASSISTANT , content = output_text ))
0 commit comments