Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions datasets/1.1/recipes/data/minimal.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
name,age
Alice,22
Bob,23
John,6
Jane,53
140 changes: 140 additions & 0 deletions datasets/1.1/recipes/minimal_multilingual.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
{
"@context": {
"@language": "en",
"@vocab": "https://schema.org/",
"arrayShape": "cr:arrayShape",
"citeAs": "cr:citeAs",
"column": "cr:column",
"conformsTo": "dct:conformsTo",
"cr": "http://mlcommons.org/croissant/",
"rai": "http://mlcommons.org/croissant/RAI/",
"data": {
"@id": "cr:data",
"@type": "@json"
},
"dataType": {
"@id": "cr:dataType",
"@type": "@vocab"
},
"description": {"@container": "@language"},
"dct": "http://purl.org/dc/terms/",
"examples": {
"@id": "cr:examples",
"@type": "@json"
},
"extract": "cr:extract",
"field": "cr:field",
"fileProperty": "cr:fileProperty",
"fileObject": "cr:fileObject",
"fileSet": "cr:fileSet",
"format": "cr:format",
"includes": "cr:includes",
"isArray": "cr:isArray",
"isLiveDataset": "cr:isLiveDataset",
"jsonPath": "cr:jsonPath",
"key": "cr:key",
"md5": "cr:md5",
"name": {"@container": "@language"},
"parentField": "cr:parentField",
"path": "cr:path",
"recordSet": "cr:recordSet",
"references": "cr:references",
"regex": "cr:regex",
"repeated": "cr:repeated",
"replace": "cr:replace",
"samplingRate": "cr:samplingRate",
"sc": "https://schema.org/",
"separator": "cr:separator",
"source": "cr:source",
"subField": "cr:subField",
"transform": "cr:transform"
},
"@type": "sc:Dataset",
"name": {
"en": "minimal_example_with_multilingual_descriptions",
"de": "minimales_Beispiel_mit_mehrsprachigen_Beschreibungen",
"fr": "exemple_minimal_avec_descriptions_multilingues"
},
"description": {
"en": "This is a minimal example, including the required and the recommended fields in multiple languages.",
"de": "Dies ist ein Minimalbeispiel, das die erforderlichen und die empfohlenen Felder in mehreren Sprachen enthält.",
"fr": "Ceci est un exemple minimal, incluant les champs obligatoires et recommandés dans plusieurs langues."
},
"conformsTo": "http://mlcommons.org/croissant/1.1",
"license": "https://creativecommons.org/licenses/by/4.0/",
"url": "https://example.com/dataset/recipes/minimal-recommended",
"distribution": [
{
"@type": "cr:FileObject",
"@id": "minimal.csv",
"name": "minimal.csv",
"contentUrl": "data/minimal.csv",
"encodingFormat": "text/csv",
"sha256": "48a7c257f3c90b2a3e529ddd2cca8f4f1bd8e49ed244ef53927649504ac55354"
}
],
"recordSet": [
{
"@type": "cr:RecordSet",
"@id": "examples",
"name": {
"en": "examples",
"de": "Beispiele",
"fr": "exemples"
},
"description": {
"en": "Records extracted from the example table, with their schema.",
"de": "Aus der Beispieltabelle extrahierte Datensätze mit ihrem Schema.",
"fr": "Enregistrements extraits de la table d'exemple, avec leur schéma."
},
"field": [
{
"@type": "cr:Field",
"@id": "examples/name",
"name": {
"en": "name",
"de": "Name",
"fr": "nom"
},
"description": {
"en": "The first column contains the name.",
"de": "Die erste Spalte enthält den Namen.",
"fr": "La première colonne contient le nom."
},
"dataType": "sc:Text",
"source": {
"fileObject": {
"@id": "minimal.csv"
},
"extract": {
"column": "name"
}
}
},
{
"@type": "cr:Field",
"@id": "examples/age",
"name": {
"en": "age",
"de": "Alter",
"fr": "âge"
},
"description": {
"en": "The second column contains the age.",
"de": "Die zweite Spalte enthält das Alter.",
"fr": "La deuxième colonne contient l'âge."
},
"dataType": "sc:Integer",
"source": {
"fileObject": {
"@id": "minimal.csv"
},
"extract": {
"column": "age"
}
}
}
]
}
]
}
4 changes: 4 additions & 0 deletions datasets/1.1/recipes/output/examples.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{"examples/name": "Alice", "examples/age": 22}
{"examples/name": "Bob", "examples/age": 23}
{"examples/name": "John", "examples/age": 6}
{"examples/name": "Jane", "examples/age": 53}
12 changes: 6 additions & 6 deletions editor/core/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ class SelectedRecordSet:
class Node:
ctx: mlc.Context = dataclasses.field(default_factory=mlc.Context)
id: str | None = None
name: str | None = None
name: str | dict[str, str] | None = None

def get_name_or_id(self):
if self.ctx.is_v0():
Expand All @@ -141,7 +141,7 @@ def get_name_or_id(self):
class FileObject(Node):
"""FileObject analogue for editor"""

description: str | None = None
description: str | dict[str, str] | None = None
contained_in: list[str] | None = dataclasses.field(default_factory=list)
content_size: str | None = None
content_url: str | None = None
Expand All @@ -156,7 +156,7 @@ class FileSet(Node):
"""FileSet analogue for editor"""

contained_in: list[str] = dataclasses.field(default_factory=list)
description: str | None = None
description: str | dict[str, str] | None = None
encoding_format: str | None = ""
includes: str | None = ""

Expand All @@ -165,7 +165,7 @@ class FileSet(Node):
class Field(Node):
"""Field analogue for editor"""

description: str | None = None
description: str | dict[str, str] | None = None
data_types: str | list[str] | None = None
equivalentProperty: str | list[str] | None = None
source: mlc.Source | None = None
Expand All @@ -178,7 +178,7 @@ class RecordSet(Node):

data: list[Any] | None = None
data_types: list[str] | None = None
description: str | None = None
description: str | dict[str, str] | None = None
is_enumeration: bool | None = None
key: str | list[str] | None = None
fields: list[Field] = dataclasses.field(default_factory=list)
Expand All @@ -188,7 +188,7 @@ class RecordSet(Node):
class Metadata(Node):
"""main croissant data object, helper functions exist to load and unload this into the mlcroissant version"""

description: str | None = None
description: str | dict[str, str] | None = None
cite_as: str | None = None
creators: list[mlc.Person] = dataclasses.field(default_factory=list)
date_published: datetime.datetime | None = None
Expand Down
8 changes: 6 additions & 2 deletions python/mlcroissant/mlcroissant/_src/core/dataclasses.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,10 @@ def jsonld_field(
**kwargs,
):
"""Overloads dataclasses.field with specific attributes."""
if cardinality not in ["ONE", "MANY"]:
raise ValueError(f"cardinality should be ONE or MANY. Got {cardinality}")
if cardinality not in ["ONE", "MANY", "LANGUAGE-TAGGED"]:
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question: do we need ONE-LANGUAGE-TAGGED and MANY-LANGUAGE-TAGGED?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No, here "LANGUAGE-TAGGED" is short for "either one value or a dict of one or more language-tagged values"

raise ValueError(
f"cardinality should be ONE, MANY or LANGUAGE-TAGGED. Got {cardinality}"
)
if input_types is None:
input_types = []
if exclusive_with is None:
Expand Down Expand Up @@ -232,6 +234,8 @@ def _check_types(cls_or_instance, field: dataclasses.Field, metadata: Metadata)
expected_type = Union[tuple(types)] # type: ignore
if metadata["cardinality"] == "MANY":
expected_type = list[expected_type] # type: ignore
elif metadata["cardinality"] == "LANGUAGE-TAGGED":
expected_type = expected_type | dict[str, expected_type] # type: ignore
if field.default != dataclasses.MISSING:
expected_type = Union[expected_type, type(field.default)] # type: ignore

Expand Down
30 changes: 24 additions & 6 deletions python/mlcroissant/mlcroissant/_src/core/json_ld.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,20 +149,22 @@ def box_singleton_list(element: Any) -> list[Any] | None:
return [element]


def recursively_populate_jsonld(entry_node: Json, id_to_node: dict[str, Json]) -> Any:
def recursively_populate_jsonld(
entry_node: Json, id_to_node: dict[str, Json], context: dict[str, Json]
) -> Any:
"""Changes in place `entry_node` with its children."""
if "@value" in entry_node:
if isinstance(entry_node, dict) and "@value" in entry_node:
if entry_node.get("@type") == namespace.RDF.JSON:
# Stringified JSON is loaded as a dict.
return json.loads(entry_node["@value"])
else:
# Other values are loaded as is.
return entry_node["@value"]
elif len(entry_node) == 1 and "@id" in entry_node:
elif isinstance(entry_node, dict) and len(entry_node) == 1 and "@id" in entry_node:
node_id = entry_node["@id"]
if node_id in id_to_node:
entry_node = id_to_node[node_id]
return recursively_populate_jsonld(entry_node, id_to_node)
return recursively_populate_jsonld(entry_node, id_to_node, context)
else:
return entry_node
elif isinstance(entry_node, (str, float, int, bool)):
Expand All @@ -177,7 +179,23 @@ def recursively_populate_jsonld(entry_node: Json, id_to_node: dict[str, Json]) -
entry_node[key] = term.URIRef(value[0])
elif isinstance(value, list):
del entry_node[key]
value = [recursively_populate_jsonld(child, id_to_node) for child in value]
if key in ("https://schema.org/name", "https://schema.org/description"):
if (
len(value) == 1
and isinstance(value[0], dict)
and "@value" in value[0]
and value[0].get("@language", context["@language"])
== context["@language"]
):
value = value[0]["@value"]
elif all(isinstance(v, dict) and "@language" in v for v in value):
value = {d["@language"]: d["@value"] for d in value}
entry_node[term.URIRef(key)] = value
continue
value = [
recursively_populate_jsonld(child, id_to_node, context)
for child in value
]
node_type = entry_node.get("@type", "")
key, node_type = term.URIRef(key), term.URIRef(node_type)
if (key, node_type) in _KEYS_WITH_LIST:
Expand Down Expand Up @@ -237,7 +255,7 @@ def expand_jsonld(data: Json, ctx: Context) -> Json:
for node in nodes:
node_id = node.get("@id")
id_to_node[node_id] = node
recursively_populate_jsonld(entry_node, id_to_node)
recursively_populate_jsonld(entry_node, id_to_node, context)
entry_node["@context"] = make_context(**context)
return entry_node

Expand Down
20 changes: 20 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/json_ld_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
import pytest

from mlcroissant._src.core import constants
from mlcroissant._src.core.context import Context
from mlcroissant._src.core.context import CroissantVersion
from mlcroissant._src.core.rdf import make_context
from mlcroissant._src.datasets import Dataset

Expand Down Expand Up @@ -65,3 +67,21 @@ def test_make_context():
"transform": "cr:transform",
"foo": "bar",
}


def test_expand_and_reduce_language_tagged():
    """Checks that language-tagged name/description serialize as language maps.

    `name` is supplied in the compact language-map form, while `description`
    is supplied as a list of `@language`/`@value` objects. Both are expected
    to come out of `to_json()` as `{language: text}` dictionaries.
    """
    version = CroissantVersion.V_1_1
    raw_jsonld = {
        "@context": make_context(Context(conforms_to=version)),
        "@type": "sc:Dataset",
        "conformsTo": version.value,
        "name": {"en": "a", "fr": "b"},
        "description": [
            {"@language": "en", "@value": "A"},
            {"@language": "de", "@value": "B"},
        ],
    }
    serialized = Dataset(raw_jsonld).metadata.to_json()
    expected_name = {"en": "a", "fr": "b"}
    expected_description = {"en": "A", "de": "B"}
    assert serialized["name"] == expected_name
    assert serialized["description"] == expected_description
6 changes: 6 additions & 0 deletions python/mlcroissant/mlcroissant/_src/core/rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ def make_context(ctx=None, **kwargs):
"data": {"@id": "cr:data", "@type": "@json"},
"dataType": {"@id": "cr:dataType", "@type": "@vocab"},
"dct": "http://purl.org/dc/terms/",
"description": (
{"@container": "@language"} if ctx is not None and ctx.is_v1_1() else None
),
"examples": {"@id": "cr:examples", "@type": "@json"},
"extract": "cr:extract",
"field": "cr:field",
Expand All @@ -47,6 +50,9 @@ def make_context(ctx=None, **kwargs):
"jsonPath": "cr:jsonPath",
"key": "sc:key" if ctx is not None and ctx.is_v0() else "cr:key",
"md5": "sc:md5" if ctx is not None and ctx.is_v0() else "cr:md5",
"name": (
{"@container": "@language"} if ctx is not None and ctx.is_v1_1() else None
),
"parentField": "cr:parentField",
"path": "cr:path",
"recordSet": "cr:recordSet",
Expand Down
12 changes: 12 additions & 0 deletions python/mlcroissant/mlcroissant/_src/datasets_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ def test_static_analysis_1_0(folder):
[
"mlfield_bad_array_definition",
"mlfield_bad_array_shape",
"multilingual_fields",
],
)
def test_static_analysis_1_1(folder):
Expand Down Expand Up @@ -236,6 +237,17 @@ def test_hermetic_loading_1_0(dataset_name, record_set_name, num_records, filter
)


# Hermetic test cases for croissant 1.1 only.
@pytest.mark.parametrize(
    ("dataset_name", "record_set_name", "num_records"),
    [
        ("recipes/minimal_multilingual.json", "examples", -1),
    ],
)
def test_hermetic_loading_1_1(dataset_name, record_set_name, num_records):
    """Runs the hermetic loading check for datasets that exist only in 1.1.

    Delegates to `load_records_and_test_equality`, pinning the version
    argument to "1.1" and forwarding the parametrized values unchanged.
    """
    version = "1.1"
    load_records_and_test_equality(
        version, dataset_name, record_set_name, num_records
    )


@parametrize_version()
def test_raises_when_the_record_set_does_not_exist(version):
dataset_folder = constants.DATASETS_FOLDER / version / "titanic"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,7 +242,10 @@ def _get_result(row):
value = _cast_value(self.node.ctx, value, field.data_type)

if self.node.ctx.is_v0():
result[field.name] = value
# v0 only supports str names
result[field.name] = (
value # pytype: disable=container-type-mismatch
)
else:
if field in self.node.fields:
result[field.id] = value
Expand Down
Loading