Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions scripts/goldgen/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""Synthetic eval gold-set generation scaffold."""

203 changes: 203 additions & 0 deletions scripts/goldgen/buckets.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
{
"buckets": [
{
"name": "chat_en",
"domain": "chat",
"language": "en",
"prompt_id": "goldgen-chat-en-v1",
"template": "candidate.md",
"description": "English chat messages with personal, place, date, and task entities.",
"required_cases": ["multi-turn chat", "one entity repeated twice", "negative query types"]
},
{
"name": "chat_ja",
"domain": "chat",
"language": "ja",
"prompt_id": "goldgen-chat-ja-v1",
"template": "candidate.md",
"description": "Japanese chat messages including casual phrasing and strict entity spans.",
"required_cases": ["令和 era dates", "田中さん honorific-attached names", "kana/kanji variants", "東京都 address form", "ja/en code switch", "emoji-heavy chat"]
},
{
"name": "chat_zh",
"domain": "chat",
"language": "zh",
"prompt_id": "goldgen-chat-zh-v1",
"template": "candidate.md",
"description": "Chinese chat messages with dates, locations, people, and reminders.",
"required_cases": ["multi-turn chat", "relative dates", "negative examples"]
},
{
"name": "chat_ko",
"domain": "chat",
"language": "ko",
"prompt_id": "goldgen-chat-ko-v1",
"template": "candidate.md",
"description": "Korean chat messages with dates, locations, people, and reminders.",
"required_cases": ["multi-turn chat", "relative dates", "negative examples"]
},
{
"name": "notes_en",
"domain": "notes",
"language": "en",
"prompt_id": "goldgen-notes-en-v1",
"template": "candidate.md",
"description": "English personal notes with dense but natural entities.",
"required_cases": ["names", "locations", "dates", "activities"]
},
{
"name": "notes_ja",
"domain": "notes",
"language": "ja",
"prompt_id": "goldgen-notes-ja-v1",
"template": "candidate.md",
"description": "Japanese personal notes with hard Japanese entity boundary cases.",
"required_cases": ["令和 era dates", "honorific-attached names", "kana/kanji variants", "都道府県 addresses"]
},
{
"name": "notes_zh",
"domain": "notes",
"language": "zh",
"prompt_id": "goldgen-notes-zh-v1",
"template": "candidate.md",
"description": "Chinese personal notes with strict span boundaries.",
"required_cases": ["names", "locations", "dates", "activities"]
},
{
"name": "notes_ko",
"domain": "notes",
"language": "ko",
"prompt_id": "goldgen-notes-ko-v1",
"template": "candidate.md",
"description": "Korean personal notes with strict span boundaries.",
"required_cases": ["names", "locations", "dates", "activities"]
},
{
"name": "journal_en",
"domain": "journal",
"language": "en",
"prompt_id": "goldgen-journal-en-v1",
"template": "candidate.md",
"description": "English journal entries with emotions, goals, people, and dates.",
"required_cases": ["emotion", "goal", "relationship reference", "negative query types"]
},
{
"name": "journal_ja",
"domain": "journal",
"language": "ja",
"prompt_id": "goldgen-journal-ja-v1",
"template": "candidate.md",
"description": "Japanese journal entries with Oneiron-adjacent entity types and JA hard cases.",
"required_cases": ["令和 era dates", "kana/kanji variants", "emoji-heavy", "relationship reference"]
},
{
"name": "journal_zh",
"domain": "journal",
"language": "zh",
"prompt_id": "goldgen-journal-zh-v1",
"template": "candidate.md",
"description": "Chinese journal entries with emotions, goals, people, and dates.",
"required_cases": ["emotion", "goal", "relationship reference", "negative query types"]
},
{
"name": "journal_ko",
"domain": "journal",
"language": "ko",
"prompt_id": "goldgen-journal-ko-v1",
"template": "candidate.md",
"description": "Korean journal entries with emotions, goals, people, and dates.",
"required_cases": ["emotion", "goal", "relationship reference", "negative query types"]
},
{
"name": "reminders_en",
"domain": "reminders",
"language": "en",
"prompt_id": "goldgen-reminders-en-v1",
"template": "candidate.md",
"description": "English reminders with dates, people, locations, and activities.",
"required_cases": ["imperative reminders", "relative dates", "negative examples"]
},
{
"name": "reminders_ja",
"domain": "reminders",
"language": "ja",
"prompt_id": "goldgen-reminders-ja-v1",
"template": "candidate.md",
"description": "Japanese reminders covering JA-specific hard cases: era dates, honorific-attached names, kana/kanji variants, prefecture addresses, ja/en code switching, and emoji-heavy text.",
"required_cases": ["令和 era dates", "honorific-attached names", "kana/kanji variants", "都道府県 addresses", "ja/en code switch", "emoji-heavy"]
},
{
"name": "reminders_zh",
"domain": "reminders",
"language": "zh",
"prompt_id": "goldgen-reminders-zh-v1",
"template": "candidate.md",
"description": "Chinese reminders with dates, people, locations, and activities.",
"required_cases": ["imperative reminders", "relative dates", "negative examples"]
},
{
"name": "reminders_ko",
"domain": "reminders",
"language": "ko",
"prompt_id": "goldgen-reminders-ko-v1",
"template": "candidate.md",
"description": "Korean reminders with dates, people, locations, and activities.",
"required_cases": ["imperative reminders", "relative dates", "negative examples"]
},
{
"name": "ja_era_dates",
"domain": "journal",
"language": "ja",
"prompt_id": "goldgen-ja-era-dates-v1",
"template": "candidate.md",
"description": "Japanese examples focused on era-name date expressions (令和, 平成, 昭和).",
"required_cases": ["令和5年", "平成31年", "昭和64年"]
},
{
"name": "ja_honorific_names",
"domain": "chat",
"language": "ja",
"prompt_id": "goldgen-ja-honorific-names-v1",
"template": "candidate.md",
"description": "Japanese examples where names appear attached to honorifics.",
"required_cases": ["田中さん", "佐藤先生", "山田部長"]
},
{
"name": "ja_kana_kanji_variants",
"domain": "notes",
"language": "ja",
"prompt_id": "goldgen-ja-kana-kanji-v1",
"template": "candidate.md",
"description": "Japanese examples containing kana and kanji variants for the same entity.",
"required_cases": ["東京/とうきょう", "大阪/おおさか", "名前 in kana and kanji"]
},
{
"name": "ja_prefecture_addresses",
"domain": "reminders",
"language": "ja",
"prompt_id": "goldgen-ja-prefecture-addresses-v1",
"template": "candidate.md",
"description": "Japanese address examples in 都道府県 form.",
"required_cases": ["東京都", "京都府", "北海道", "大阪府"]
},
{
"name": "ja_en_code_switch",
"domain": "chat",
"language": "ja",
"prompt_id": "goldgen-ja-en-code-switch-v1",
"template": "candidate.md",
"description": "Japanese-English code-switched examples.",
"required_cases": ["meeting", "deadline", "Slack", "カフェ"]
},
{
"name": "ja_emoji_heavy",
"domain": "chat",
"language": "ja",
"prompt_id": "goldgen-ja-emoji-heavy-v1",
"template": "candidate.md",
"description": "Emoji-heavy Japanese chat examples.",
"required_cases": ["emoji adjacent to entity", "emoji between entities", "casual chat"]
}
],
"default_negative_types": ["ORG", "PRODUCT", "WORK_OF_ART", "EVENT", "MONEY"]
}
Loading