|
63 | 63 | "execution_count": null, |
64 | 64 | "metadata": {}, |
65 | 65 | "outputs": [], |
| 66 | + "source": [ |
| 67 | + "%pip install oss2 numpy pandas" |
| 68 | + ] |
| 69 | + }, |
| 70 | + { |
| 71 | + "cell_type": "code", |
| 72 | + "execution_count": 1, |
| 73 | + "metadata": {}, |
| 74 | + "outputs": [], |
| 75 | + "source": [ |
| 76 | + "import os\n", |
| 77 | + "os.environ[\"OSS_ACCESS_KEY_ID\"] = \"<YOUR_ACCESS_KEY>\"\n", |
| 78 | + "os.environ[\"OSS_ACCESS_KEY_SECRET\"] = \"<YOUR_ACCESS_SECRET>\"" |
| 79 | + ] |
| 80 | + }, |
| 81 | + { |
| 82 | + "cell_type": "code", |
| 83 | + "execution_count": 2, |
| 84 | + "metadata": {}, |
| 85 | + "outputs": [ |
| 86 | + { |
| 87 | + "name": "stdout", |
| 88 | + "output_type": "stream", |
| 89 | + "text": [ |
| 90 | + "DataFrame initialized.\n" |
| 91 | + ] |
| 92 | + }, |
| 93 | + { |
| 94 | + "data": { |
| 95 | + "text/plain": [ |
| 96 | + "<oss2.models.PutObjectResult at 0x7f8e7b9b1de0>" |
| 97 | + ] |
| 98 | + }, |
| 99 | + "execution_count": 2, |
| 100 | + "metadata": {}, |
| 101 | + "output_type": "execute_result" |
| 102 | + } |
| 103 | + ], |
66 | 104 | "source": [ |
67 | 105 | "import numpy as np\n", |
68 | 106 | "import pandas as pd\n", |
69 | 107 | "\n", |
70 | | - "# 生成大小约为22G的dataframe\n", |
71 | | - "num_rows = 6000 * 10000\n", |
| 108 | + "# Fake data\n", |
| 109 | + "num_rows = 600 * 1000\n", |
72 | 110 | "df = pd.DataFrame({\n", |
73 | 111 | " 'Id': np.random.randint(1, 100000, num_rows),\n", |
74 | 112 | " 'MSSubClass': np.random.randint(20, 201, size=num_rows),\n", |
|
118 | 156 | " 'SalePrice': np.random.randint(50000, 800001, num_rows),\n", |
119 | 157 | "})\n", |
120 | 158 | "\n", |
| 159 | + "print(\"DataFrame initialized.\")\n", |
| 160 | + "\n", |
121 | 161 | "import oss2\n", |
122 | 162 | "import io\n", |
123 | 163 | "from oss2.credentials import EnvironmentVariableCredentialsProvider\n", |
124 | 164 | "# 请将您的 OSS accessKeyID 和 accessKeySecret 分别设置成环境变量 OSS_ACCESS_KEY_ID 和 OSS_ACCESS_KEY_SECRET\n", |
125 | 165 | "auth = oss2.ProviderAuth(EnvironmentVariableCredentialsProvider())\n", |
126 | 166 | "# 请将下一行中的示例 Endpoint 和 Bucket 名称替换为您自己的 OSS Endpoint 和 Bucket\n", |
127 | | - "bucket = oss2.Bucket(auth, 'OSS_ENDPOINT', 'BUCKET_NAME')\n", |
| 167 | + "bucket = oss2.Bucket(auth, 'oss-cn-beijing.aliyuncs.com', 'fluid-demo')\n", |
128 | 168 | "\n", |
129 | 169 | "bytes_buffer = io.BytesIO()\n", |
130 | 170 | "df.to_pickle(bytes_buffer)\n", |
|
140 | 180 | }, |
141 | 181 | { |
142 | 182 | "cell_type": "code", |
143 | | - "execution_count": null, |
| 183 | + "execution_count": 3, |
144 | 184 | "metadata": {}, |
145 | 185 | "outputs": [], |
146 | 186 | "source": [ |
|
190 | 230 | }, |
191 | 231 | { |
192 | 232 | "cell_type": "code", |
193 | | - "execution_count": null, |
| 233 | + "execution_count": 4, |
194 | 234 | "metadata": {}, |
195 | 235 | "outputs": [], |
196 | 236 | "source": [ |
197 | 237 | "from kubernetes.client import models as k8s_models\n", |
198 | 238 | "# 定义任务运行模版,并挂载OSS Volume\n", |
199 | 239 | "def create_processor(script):\n", |
200 | 240 | " return models.Processor(\n", |
201 | | - " # 当按照前面的可选步骤开启fuse亲和性调度后, 添加下列标签, 从而实现数据处理的最佳性能\n", |
202 | | - " # pod_metadata=models.PodMetadata(\n", |
203 | | - " # labels={\"fuse.serverful.fluid.io/inject\": \"true\"},\n", |
204 | | - " # ),\n", |
205 | 241 | " script=models.ScriptProcessor(\n", |
206 | 242 | " command=[\"bash\"],\n", |
207 | 243 | " source=script,\n", |
|
229 | 265 | "- **创建任务模版:** 代码中封装了一个名为`create_processor`的任务模板函数,该函数接收一个bash脚本并把它传入作为某个容器的启动命令。该容器中定义了Python 3.10的运行环境,并在`/data`目录下挂载了OSS存储数据源。" |
230 | 266 | ] |
231 | 267 | }, |
| 268 | + { |
| 269 | + "cell_type": "markdown", |
| 270 | + "metadata": {}, |
| 271 | + "source": [ |
| 272 | + "> 注意:挂载OSS存储数据源前,需要在集群中提前创建名为`pvc-oss`的PersistentVolumeClaim(PVC)资源,并将其绑定到一个OSS类型的PersistentVolume(PV)资源上。该PV资源需要指定数据准备步骤中上传数据所在的Bucket路径。" |
| 273 | + ] |
| 274 | + }, |
232 | 275 | { |
233 | 276 | "cell_type": "code", |
234 | | - "execution_count": null, |
| 277 | + "execution_count": 5, |
235 | 278 | "metadata": {}, |
236 | 279 | "outputs": [], |
237 | 280 | "source": [ |
238 | 281 | "# 定义数据预处理脚本\n", |
239 | 282 | "preprocess_data_script = \"\"\"\n", |
| 283 | + "# pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple\n", |
240 | 284 | "pip3 install numpy pandas pyarrow requests vineyard scikit-learn==1.4.0 joblib==1.3.2\n", |
241 | 285 | "#!/bin/bash\n", |
242 | 286 | "set -ex\n", |
|
272 | 316 | "\n", |
273 | 317 | "# 定义模型训练脚本\n", |
274 | 318 | "train_data_script = \"\"\"\n", |
| 319 | + "# pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple\n", |
275 | 320 | "pip3 install numpy pandas pyarrow requests vineyard scikit-learn==1.4.0 joblib==1.3.2\n", |
276 | 321 | "#!/bin/bash\n", |
277 | 322 | "set -ex\n", |
|
297 | 342 | "\n", |
298 | 343 | "# 定义模型测试脚本\n", |
299 | 344 | "test_data_script = \"\"\"\n", |
| 345 | + "# pip3 config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple\n", |
300 | 346 | "pip3 install numpy pandas pyarrow requests vineyard scikit-learn==1.4.0 joblib==1.3.2\n", |
301 | 347 | "#!/bin/bash\n", |
302 | 348 | "set -ex\n", |
|
339 | 385 | }, |
340 | 386 | { |
341 | 387 | "cell_type": "code", |
342 | | - "execution_count": null, |
| 388 | + "execution_count": 6, |
343 | 389 | "metadata": {}, |
344 | 390 | "outputs": [], |
345 | 391 | "source": [ |
|
352 | 398 | }, |
353 | 399 | { |
354 | 400 | "cell_type": "code", |
355 | | - "execution_count": null, |
| 401 | + "execution_count": 7, |
356 | 402 | "metadata": {}, |
357 | 403 | "outputs": [], |
358 | 404 | "source": [ |
|
370 | 416 | }, |
371 | 417 | { |
372 | 418 | "cell_type": "code", |
373 | | - "execution_count": null, |
| 419 | + "execution_count": 8, |
374 | 420 | "metadata": {}, |
375 | 421 | "outputs": [], |
376 | 422 | "source": [ |
|
380 | 426 | } |
381 | 427 | ], |
382 | 428 | "metadata": { |
| 429 | + "kernelspec": { |
| 430 | + "display_name": "python", |
| 431 | + "language": "python", |
| 432 | + "name": "python3" |
| 433 | + }, |
383 | 434 | "language_info": { |
384 | | - "name": "python" |
| 435 | + "codemirror_mode": { |
| 436 | + "name": "ipython", |
| 437 | + "version": 3 |
| 438 | + }, |
| 439 | + "file_extension": ".py", |
| 440 | + "mimetype": "text/x-python", |
| 441 | + "name": "python", |
| 442 | + "nbconvert_exporter": "python", |
| 443 | + "pygments_lexer": "ipython3", |
| 444 | + "version": "3.10.13" |
385 | 445 | } |
386 | 446 | }, |
387 | 447 | "nbformat": 4, |
|
0 commit comments