Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
"""
Convert filtered samples to rejection sampling training data format.

This script converts the filtered correct samples into the format required
for rejection sampling training, similar to imagegen-cot-reward dataset.
"""

import os
import json
import argparse
from typing import List, Dict
from loguru import logger


def convert_to_rejection_sampling_format(
filtered_samples_path: str,
output_path: str,
data_root: str,
task_instruction_template: str = None,
):
"""
Convert filtered samples to rejection sampling training format.

:param filtered_samples_path: Path to filtered samples JSON file
:type filtered_samples_path: str
:param output_path: Path to save converted training data
:type output_path: str
:param data_root: Root directory of the dataset (for image paths)
:type data_root: str
:param task_instruction_template: Template for task instruction
:type task_instruction_template: str, optional
:return: List of training data items in imagegen-cot-reward format
:rtype: List[Dict]
"""
logger.info(f"Loading filtered samples from {filtered_samples_path}")

with open(filtered_samples_path, 'r', encoding='utf-8') as f:
filtered_samples = json.load(f)

logger.info(f"Loaded {len(filtered_samples)} filtered samples")

# Default task instruction template
if task_instruction_template is None:
task_instruction_template = """Given a caption and two images generated based on this caption, please analyze in detail the two provided images.
Evaluate them on various dimensions such as semantic consistency (how closely the image content aligns with the caption),
aesthetics (composition, color usage, artistic expression), authenticity (realism and attention to detail),
and any other factors you deem relevant. For each evaluation dimension,
provide a score between 1-10 for both images (e.g., Image 1: 8/10, Image 2: 6/10) and provide a concise rationale for the score.
Calculate the total score for each image by summing all dimension scores.
Use a chain-of-thought process to detail your reasoning steps, and enclose all your detailed reasoning within <think> and </think> tags.
Then, in the <answer> tag, output exactly one of the following strings: 'Image 1 is better' or 'Image 2 is better' based on the total scores.
No additional text is allowed in the <answer> section.
Example output format:
<think>
Semantic consistency: Image 1 (9/10) - ...; Image 2 (7/10) - ...
Aesthetics: Image 2 (8/10) - ...; Image 1 (8/10) - ...
Authenticity: Image 1 (8/10) - ...; Image 2 (5/10) - ...
[Additional dimensions if any]: Image 2 (8/10) - ...; Image 1 (6/10) - ...
Total score:
Image 1: 9+8+8+6=31
Image 2: 7+8+5+8=28
</think>
<answer>Image 1 is better</answer>
Note: In the example above, scores and the final answer are placeholders meant only to demonstrate the format. Your actual evaluation should be based on the quality of two given images.
Your task is provided as follows:
Text Caption: {prompt}"""

training_data = []

for idx, sample in enumerate(filtered_samples):
prompt = sample['prompt']
path1 = sample['path1']
path2 = sample['path2']
preference = sample['preference']
generated_text = sample.get('generated_text', '')
reasoning = sample.get('reasoning', '')

# Determine which image is better based on preference
# In HPDv3, path1 is the preferred path, path2 is the rejected path
# preference "A" means Image 1 (which is path1) is better
# preference "B" means Image 2 (which is path2) is better, but this means path2 was randomly chosen as Image 1
# Actually, in HPDv3GRMHandler, preference A means image0 (first shown) is preferred
# So we need to check which path corresponds to which image

# Since we stored preferred_path and rejected_path, we know:
# - preferred_path (path1) should be the better one
# - rejected_path (path2) should be the worse one
# But the handler randomly assigns them to Image 1 or Image 2

# For training data, we always use: Image 1 = preferred, Image 2 = rejected
# This ensures consistency
answer = "Image 1 is better" if preference == "A" else "Image 2 is better"
image1_path = path1 # preferred
image2_path = path2 # rejected

# Build the response with CoT reasoning
# Note: We use <think> instead of <think> to match the instruction format
if reasoning:
# Use the extracted reasoning from inference
# Clean up the reasoning text
reasoning_clean = reasoning.strip()
response = f"<think>\n{reasoning_clean}\n</think>\n<answer>{answer}</answer>"
else:
# If no reasoning was extracted, create a placeholder
# In practice, you might want to regenerate this or use a template
response = f"<think>\nBased on the evaluation of semantic consistency, aesthetics, and authenticity, I will compare the two images.\n</think>\n<answer>{answer}</answer>"

# Build conversations format
task_instruction = task_instruction_template.format(prompt=prompt)

# Create training data item in imagegen-cot-reward format
training_item = {
"conversations": [
{
"from": "human",
"value": task_instruction
},
{
"from": "gpt",
"value": response
}
],
"images": [
image1_path if os.path.isabs(image1_path) else os.path.join(data_root, image1_path),
image2_path if os.path.isabs(image2_path) else os.path.join(data_root, image2_path),
]
}

training_data.append(training_item)

if (idx + 1) % 100 == 0:
logger.info(f"Converted {idx + 1}/{len(filtered_samples)} samples")

# Save training data
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(training_data, f, indent=2, ensure_ascii=False)

logger.info(f"Saved {len(training_data)} training samples to {output_path}")

return training_data


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert filtered samples to rejection sampling training format")
parser.add_argument("--filtered_samples_path", type=str, required=True,
help="Path to filtered samples JSON file")
parser.add_argument("--output_path", type=str, required=True,
help="Path to save converted training data")
parser.add_argument("--data_root", type=str, required=True,
help="Root directory of the dataset (for image paths)")
parser.add_argument("--task_instruction", type=str, default=None,
help="Task instruction template (optional)")

args = parser.parse_args()

convert_to_rejection_sampling_format(
filtered_samples_path=args.filtered_samples_path,
output_path=args.output_path,
data_root=args.data_root,
task_instruction_template=args.task_instruction,
)

Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
"""
Convert filtered samples to rejection sampling training data format for T2V.

This script converts the filtered correct samples into the format required
for rejection sampling training, similar to imagegen-cot-reward dataset but for videos.
"""

import os
import json
import argparse
from typing import List, Dict
from loguru import logger


def convert_to_rejection_sampling_format(
filtered_samples_path: str,
output_path: str,
task_instruction_template: str = None,
video_fps: float = 2.0,
):
"""
Convert filtered samples to rejection sampling training format for T2V.

:param filtered_samples_path: Path to filtered samples JSON file
:type filtered_samples_path: str
:param output_path: Path to save converted training data
:type output_path: str
:param task_instruction_template: Template for task instruction
:type task_instruction_template: str, optional
:param video_fps: FPS for video processing
:type video_fps: float
:return: List of training data items in imagegen-cot-reward format (for videos)
:rtype: List[Dict]
"""
logger.info(f"Loading filtered samples from {filtered_samples_path}")

with open(filtered_samples_path, 'r', encoding='utf-8') as f:
filtered_samples = json.load(f)

logger.info(f"Loaded {len(filtered_samples)} filtered samples")

# Default task instruction template for T2V
if task_instruction_template is None:
task_instruction_template = """Given a caption and two videos generated based on this caption, please analyze in detail the two provided videos.
Evaluate them on various dimensions such as semantic consistency (how closely the video content aligns with the caption), temporal coherence (smoothness and logical flow of motion across frames), authenticity (realism and attention to detail), and any other factors you deem relevant.
For each evaluation dimension, provide a score between 1-10 for both videos (e.g., Video 1: 8/10, Video 2: 6/10) and provide a concise rationale for the score.
Calculate the total score for each video by summing all dimension scores.
Use a chain-of-thought process to detail your reasoning steps, and enclose all your detailed reasoning within <think> and </think> tags. Then, in the <answer> tag, output exactly one of the following strings:
'Video 1 is better' or 'Video 2 is better' based on the total scores. No additional text is allowed in the <answer> section.
Example output format:
<think>
1. Semantic consistency: Video 1 (9/10) - ...; Video 2 (7/10) - ...
2. Temporal coherence: Video 1 (8/10) - ...; Video 2 (6/10) - ...
3. Authenticity: Video 1 (7/10) - ...; Video 2 (5/10) - ...
...
[Additional dimensions if any]: Video 2 (8/10) - ...; Video 1 (6/10) - ...
Total score:
Video 1: 9+8+7+6=30
Video 2: 7+6+5+8=26
</think>
<answer>Video 1 is better</answer>

Note: In the example above, scores and the final answer are placeholders meant only to demonstrate the format. Your actual evaluation should be based on the quality of two given videos.
Your task is provided as follows:
Text Caption: {prompt}"""

training_data = []

for idx, sample in enumerate(filtered_samples):
prompt = sample['prompt']
path1 = sample['path1']
path2 = sample['path2']
preference = sample['preference']
generated_text = sample.get('generated_text', '')
reasoning = sample.get('reasoning', '')

# Determine which video is better based on preference
# preference "A" means Video 1 is better
# preference "B" means Video 2 is better
# For training data, we always use: Video 1 = preferred, Video 2 = rejected
# This ensures consistency
answer = "Video 1 is better" if preference == "A" else "Video 2 is better"
video1_path = path1 if preference == "A" else path2 # preferred
video2_path = path2 if preference == "A" else path1 # rejected

# Build the response with CoT reasoning
if reasoning:
# Use the extracted reasoning from inference
# Clean up the reasoning text
reasoning_clean = reasoning.strip()
response = f"<think>\n{reasoning_clean}\n</think>\n<answer>{answer}</answer>"
else:
# If no reasoning was extracted, create a placeholder
response = f"<think>\nBased on the evaluation of semantic consistency, temporal coherence, and authenticity, I will compare the two videos.\n</think>\n<answer>{answer}</answer>"

# Build conversations format
task_instruction = task_instruction_template.format(prompt=prompt)

# Create training data item in imagegen-cot-reward format (but for videos)
# We use "images" field name to be compatible with ImageGenCoTRewardHandler
# but store video paths - the handler will need to be modified to support videos
# For now, we store relative paths from data_root
training_item = {
"conversations": [
{
"from": "human",
"value": task_instruction
},
{
"from": "gpt",
"value": response
}
],
"images": [
video1_path if os.path.isabs(video1_path) else video1_path,
video2_path if os.path.isabs(video2_path) else video2_path,
],
"video_fps": video_fps, # Store FPS for video processing
}

training_data.append(training_item)

if (idx + 1) % 100 == 0:
logger.info(f"Converted {idx + 1}/{len(filtered_samples)} samples")

# Save training data
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
json.dump(training_data, f, indent=2, ensure_ascii=False)

logger.info(f"Saved {len(training_data)} training samples to {output_path}")

return training_data


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Convert filtered samples to rejection sampling training format for T2V")
parser.add_argument("--filtered_samples_path", type=str, required=True,
help="Path to filtered samples JSON file")
parser.add_argument("--output_path", type=str, required=True,
help="Path to save converted training data")
parser.add_argument("--task_instruction", type=str, default=None,
help="Task instruction template (optional)")
parser.add_argument("--video_fps", type=float, default=2.0,
help="FPS for video processing")

args = parser.parse_args()

convert_to_rejection_sampling_format(
filtered_samples_path=args.filtered_samples_path,
output_path=args.output_path,
task_instruction_template=args.task_instruction,
video_fps=args.video_fps,
)

Loading