
Commit 7ba4477

shcheklein and lhoestq authored
fix(hf): use proper source when we create a file entry (#555)
* fix(hf): use proper source when we create a file entry
* add more details to the unsupported PyArrow type message
* add example: HF -> OpenAI -> HF -> analyze
* use HF inference endpoint
* use to_parquet / from_parquet to preserve schema
* add a bit of comments, fix them
* use HF_TOKEN to run e2e HF example

Co-authored-by: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com>
1 parent: a516c94 · commit: 7ba4477

File tree: 4 files changed (+63 / -1 lines)

.github/workflows/tests.yml

Lines changed: 2 additions & 0 deletions
@@ -152,4 +152,6 @@ jobs:
         run: uv pip install nox --system
 
       - name: Run examples
+        env:
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
         run: nox -s examples -p ${{ matrix.pyv }} -- -m "${{ matrix.group }}"
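With this change the examples CI job exposes HF_TOKEN to the example scripts. huggingface_hub's InferenceClient reads the token from the HF_TOKEN environment variable by default, and it can also be passed explicitly. A minimal sketch (the model name is taken from the example below; nothing here is prescribed by the workflow itself):

import os

from huggingface_hub import InferenceClient

# The token comes from the HF_TOKEN environment variable set by the workflow;
# passing it explicitly is equivalent to letting InferenceClient pick it up.
client = InferenceClient(
    "meta-llama/Llama-3.1-70B-Instruct",
    token=os.environ.get("HF_TOKEN"),
)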
Lines changed: 59 additions & 0 deletions
@@ -0,0 +1,59 @@
from huggingface_hub import InferenceClient

from datachain import C, DataChain, DataModel

PROMPT = """
Was this dialog successful? Put result as a single word: Success or Failure.
Explain the reason in a few words.
"""


class DialogEval(DataModel):
    result: str
    reason: str


# DataChain function to evaluate a dialog.
# DataChain uses the input and result types to automatically infer the schema.
def eval_dialog(user_input: str, bot_response: str) -> DialogEval:
    client = InferenceClient("meta-llama/Llama-3.1-70B-Instruct")

    completion = client.chat_completion(
        messages=[
            {
                "role": "user",
                "content": f"{PROMPT}\n\nUser: {user_input}\nBot: {bot_response}",
            },
        ],
        response_format={"type": "json", "value": DialogEval.model_json_schema()},
    )

    message = completion.choices[0].message
    try:
        return DialogEval.model_validate_json(message.content)
    except ValueError:
        return DialogEval(result="Error", reason="Failed to parse response.")


# Run HF inference in parallel for each example.
# Get the result as a Pydantic model that DataChain can understand and serialize.
# Save to HF as Parquet. The dataset can be previewed here:
# https://huggingface.co/datasets/dvcorg/test-datachain-llm-eval/viewer
(
    DataChain.from_csv(
        "hf://datasets/infinite-dataset-hub/MobilePlanAssistant/data.csv"
    )
    .settings(parallel=10)
    .map(response=eval_dialog)
    .to_parquet("hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet")
)

# Read it back to filter and show.
# It restores the Pydantic model from Parquet under the hood.
(
    DataChain.from_parquet(
        "hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet", source=False
    )
    .filter(C("response.result") == "Failure")
    .show(3)
)
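As a usage note, the saved Parquet can also be read back and materialized as Python objects rather than only previewed. A minimal sketch, reusing the dataset path from the example and assuming the collect() API available in this DataChain version:

from datachain import C, DataChain

chain = DataChain.from_parquet(
    "hf://datasets/dvcorg/test-datachain-llm-eval/data.parquet", source=False
)

# Each yielded item is a DialogEval instance restored from the Parquet schema.
for response in chain.filter(C("response.result") == "Failure").collect("response"):
    print(response.result, "-", response.reason)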

src/datachain/client/hf.py

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ def create_fs(cls, **kwargs) -> HfFileSystem:
 
     def info_to_file(self, v: dict[str, Any], path: str) -> File:
         return File(
+            source=self.uri,
             path=path,
             size=v["size"],
             version=v["last_commit"].oid,
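For context, populating source means an HF file entry now carries both the storage URI and the relative path, so the full location can be reconstructed downstream. A minimal sketch with illustrative values (the field values and the way the URI is rebuilt are assumptions, not code from this commit):

from datachain.lib.file import File

# Illustrative values; `version` mirrors the last-commit oid used by the HF client.
entry = File(
    source="hf://datasets/some-user/some-dataset",
    path="data.csv",
    size=1024,
    version="abc1234",
)

# With `source` set, the full hf:// location is recoverable.
print(f"{entry.source}/{entry.path}")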

src/datachain/lib/arrow.py

Lines changed: 1 addition & 1 deletion
@@ -175,7 +175,7 @@ def arrow_type_mapper(col_type: pa.DataType, column: str = "") -> type:  # noqa:
         return dict
     if isinstance(col_type, pa.lib.DictionaryType):
         return arrow_type_mapper(col_type.value_type)  # type: ignore[return-value]
-    raise TypeError(f"{col_type!r} datatypes not supported")
+    raise TypeError(f"{col_type!r} datatypes not supported, column: {column}")
 
 
 def _nrows_file(file: File, nrows: int) -> str:
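The extra detail makes schema-inference failures easier to trace back to a specific column. A minimal sketch of the new message, assuming a PyArrow type the mapper does not handle (the decimal type and column name are illustrative):

import pyarrow as pa

from datachain.lib.arrow import arrow_type_mapper

try:
    # Decimal columns are one example of a type the mapper may not support.
    arrow_type_mapper(pa.decimal128(38, 9), column="price")
except TypeError as exc:
    # Prints something like:
    # Decimal128Type(decimal128(38, 9)) datatypes not supported, column: price
    print(exc)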
