Skip to content

Commit 994e38b

Browse files
committed
Update vineyard notebook
Signed-off-by: trafalgarzzz <trafalgarz@outlook.com>
1 parent 9d242dc commit 994e38b

File tree

1 file changed

+74
-14
lines changed

1 file changed

+74
-14
lines changed

examples/03_dataflow_with_vineyard/vineyard.ipynb

Lines changed: 74 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -63,12 +63,50 @@
6363
"execution_count": null,
6464
"metadata": {},
6565
"outputs": [],
66+
"source": [
67+
"%pip install oss2 numpy pandas"
68+
]
69+
},
70+
{
71+
"cell_type": "code",
72+
"execution_count": 1,
73+
"metadata": {},
74+
"outputs": [],
75+
"source": [
76+
"import os\n",
77+
"os.environ[\"OSS_ACCESS_KEY_ID\"] = \"<YOUR_ACCESS_KEY>\"\n",
78+
"os.environ[\"OSS_ACCESS_KEY_SECRET\"] = \"<YOUR_ACCESS_SECRET>\""
79+
]
80+
},
81+
{
82+
"cell_type": "code",
83+
"execution_count": 2,
84+
"metadata": {},
85+
"outputs": [
86+
{
87+
"name": "stdout",
88+
"output_type": "stream",
89+
"text": [
90+
"DataFrame initialized.\n"
91+
]
92+
},
93+
{
94+
"data": {
95+
"text/plain": [
96+
"<oss2.models.PutObjectResult at 0x7f8e7b9b1de0>"
97+
]
98+
},
99+
"execution_count": 2,
100+
"metadata": {},
101+
"output_type": "execute_result"
102+
}
103+
],
66104
"source": [
67105
"import numpy as np\n",
68106
"import pandas as pd\n",
69107
"\n",
70-
"# 生成大小约为22G的dataframe\n",
71-
"num_rows = 6000 * 10000\n",
108+
"# Fake data\n",
109+
"num_rows = 600 * 1000\n",
72110
"df = pd.DataFrame({\n",
73111
" 'Id': np.random.randint(1, 100000, num_rows),\n",
74112
" 'MSSubClass': np.random.randint(20, 201, size=num_rows),\n",
@@ -118,13 +156,15 @@
118156
" 'SalePrice': np.random.randint(50000, 800001, num_rows),\n",
119157
"})\n",
120158
"\n",
159+
"print(\"DataFrame initialized.\")\n",
160+
"\n",
121161
"import oss2\n",
122162
"import io\n",
123163
"from oss2.credentials import EnvironmentVariableCredentialsProvider\n",
124164
"# 请将您的 OSS accessKeyID 和 accessKeySecret 分别设置成环境变量 OSS_ACCESS_KEY_ID 和 OSS_ACCESS_KEY_SECRET\n",
125165
"auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())\n",
126166
"# 请将下方代码中的 Endpoint 和 Bucket 名称替换为您自己的 OSS Endpoint 和 Bucket\n",
127-
"bucket = oss2.Bucket(auth, 'OSS_ENDPOINT', 'BUCKET_NAME')\n",
167+
"bucket = oss2.Bucket(auth, 'oss-cn-beijing.aliyuncs.com', 'fluid-demo')\n",
128168
"\n",
129169
"bytes_buffer = io.BytesIO()\n",
130170
"df.to_pickle(bytes_buffer)\n",
@@ -140,7 +180,7 @@
140180
},
141181
{
142182
"cell_type": "code",
143-
"execution_count": null,
183+
"execution_count": 3,
144184
"metadata": {},
145185
"outputs": [],
146186
"source": [
@@ -190,18 +230,14 @@
190230
},
191231
{
192232
"cell_type": "code",
193-
"execution_count": null,
233+
"execution_count": 4,
194234
"metadata": {},
195235
"outputs": [],
196236
"source": [
197237
"from kubernetes.client import models as k8s_models\n",
198238
"# 定义任务运行模版,并挂载OSS Volume\n",
199239
"def create_processor(script):\n",
200240
" return models.Processor(\n",
201-
" # 当按照前面的可选步骤开启fuse亲和性调度后, 添加下列标签, 从而实现数据处理的最佳性能\n",
202-
" # pod_metadata=models.PodMetadata(\n",
203-
" # labels={\"fuse.serverful.fluid.io/inject\": \"true\"},\n",
204-
" # ),\n",
205241
" script=models.ScriptProcessor(\n",
206242
" command=[\"bash\"],\n",
207243
" source=script,\n",
@@ -229,14 +265,22 @@
229265
"- **创建任务模版:** 代码中封装了一个名为`create_processor`的任务模板函数,该函数接收一个bash脚本并把它传入作为某个容器的启动命令。该容器中定义了Python 3.10的运行环境,并在`/data`目录下挂载了OSS存储数据源。"
230266
]
231267
},
268+
{
269+
"cell_type": "markdown",
270+
"metadata": {},
271+
"source": [
272+
"> 注意:挂载OSS存储数据源前,需要在集群中提前创建名为`pvc-oss`的PersistentVolumeClaim(PVC)资源,并将其绑定到一个OSS类型的PersistentVolume(PV)资源上。该PV资源需要指定数据准备步骤中上传数据时所使用的Bucket路径。"
273+
]
274+
},
232275
{
233276
"cell_type": "code",
234-
"execution_count": null,
277+
"execution_count": 5,
235278
"metadata": {},
236279
"outputs": [],
237280
"source": [
238281
"# 定义数据预处理脚本\n",
239282
"preprocess_data_script = \"\"\"\n",
283+
"# pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple\n",
240284
"pip3 install numpy pandas pyarrow requests vineyard scikit-learn==1.4.0 joblib==1.3.2\n",
241285
"#!/bin/bash\n",
242286
"set -ex\n",
@@ -272,6 +316,7 @@
272316
"\n",
273317
"# 定义模型训练脚本\n",
274318
"train_data_script = \"\"\"\n",
319+
"# pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple\n",
275320
"pip3 install numpy pandas pyarrow requests vineyard scikit-learn==1.4.0 joblib==1.3.2\n",
276321
"#!/bin/bash\n",
277322
"set -ex\n",
@@ -297,6 +342,7 @@
297342
"\n",
298343
"# 定义模型测试脚本\n",
299344
"test_data_script = \"\"\"\n",
345+
"# pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple\n",
300346
"pip3 install numpy pandas pyarrow requests vineyard scikit-learn==1.4.0 joblib==1.3.2\n",
301347
"#!/bin/bash\n",
302348
"set -ex\n",
@@ -339,7 +385,7 @@
339385
},
340386
{
341387
"cell_type": "code",
342-
"execution_count": null,
388+
"execution_count": 6,
343389
"metadata": {},
344390
"outputs": [],
345391
"source": [
@@ -352,7 +398,7 @@
352398
},
353399
{
354400
"cell_type": "code",
355-
"execution_count": null,
401+
"execution_count": 7,
356402
"metadata": {},
357403
"outputs": [],
358404
"source": [
@@ -370,7 +416,7 @@
370416
},
371417
{
372418
"cell_type": "code",
373-
"execution_count": null,
419+
"execution_count": 8,
374420
"metadata": {},
375421
"outputs": [],
376422
"source": [
@@ -380,8 +426,22 @@
380426
}
381427
],
382428
"metadata": {
429+
"kernelspec": {
430+
"display_name": "python",
431+
"language": "python",
432+
"name": "python3"
433+
},
383434
"language_info": {
384-
"name": "python"
435+
"codemirror_mode": {
436+
"name": "ipython",
437+
"version": 3
438+
},
439+
"file_extension": ".py",
440+
"mimetype": "text/x-python",
441+
"name": "python",
442+
"nbconvert_exporter": "python",
443+
"pygments_lexer": "ipython3",
444+
"version": "3.10.13"
385445
}
386446
},
387447
"nbformat": 4,

0 commit comments

Comments
 (0)