diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/README.md b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/README.md
new file mode 100644
index 000000000000..e6702e8917a5
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/README.md
@@ -0,0 +1,24 @@
+# Kaggle Competition Pipeline Sample
+
+## Pipeline Overview
+
+This is a pipeline for [house price prediction](https://www.kaggle.com/c/house-prices-advanced-regression-techniques), an entry-level competition on Kaggle. We demonstrate how to complete a Kaggle competition with a pipeline whose steps download the data, preprocess and visualize it, train a model, and submit the results to the Kaggle website.
+
+* We refer to [the notebook by Raj Kumar Gupta](https://www.kaggle.com/rajgupta5/house-price-prediction) and [the notebook by Sergei Neviadomski](https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn) for the model implementation as well as the data visualization.
+
+* We use the [Kaggle Python API](https://github.com/Kaggle/kaggle-api) to interact with the Kaggle site, for example to download data and submit results. More usage can be found in their documentation.
+
+* We use [Cloud Build](https://cloud.google.com/cloud-build/) for the CI process: a build and run are triggered automatically as soon as code is pushed to the GitHub repo. To enable this, set up a Cloud Build trigger on your GitHub repo branch.
+
+## Notice
+* You can authenticate to GCP services in either of two ways: create a "user-gcp-sa" secret by following the troubleshooting section in the [Kubeflow Pipelines repo](https://github.com/kubeflow/pipelines/tree/master/manifests/kustomize), or configure Workload Identity as instructed in [this guide](https://cloud.google.com/kubernetes-engine/docs/how-to/workload-identity). This sample uses the first method, which will soon be deprecated; we recommend switching to the second method to replace the use of the "user-gcp-sa" service account in the future.
+
+## Usage
+
+* Substitute the constants listed under "substitutions" in cloudbuild.yaml.
+* Fill in your kaggle_username and kaggle_key in the Dockerfiles (in the "download_dataset" and "submit_result" folders) to authenticate to Kaggle. You can get both values from an API token created on your Kaggle "My Account" page.
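+
+  For example, the two ENV lines in those Dockerfiles would end up looking like the following sketch; the username and key shown here are hypothetical placeholders, not a real token:
+
+  ```dockerfile
+  # Placeholder values copied from a downloaded kaggle.json API token; substitute your own.
+  ENV KAGGLE_USERNAME=jdoe \
+      KAGGLE_KEY=0123456789abcdef0123456789abcdef
+  ```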
+* Set up a Cloud Build trigger on your GitHub repo for continuous integration.
+* Replace CLOUDSDK_COMPUTE_ZONE and CLOUDSDK_CONTAINER_CLUSTER in cloudbuild.yaml with your own zone and cluster.
+* Enable "Kubernetes Engine Developer" in the Cloud Build settings.
+* Make your GCS bucket public, or grant Cloud Storage access to Cloud Build and Kubeflow Pipelines.
+* Commit and push the code to your GitHub repo.
\ No newline at end of file
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/cloudbuild.yaml b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/cloudbuild.yaml
new file mode 100644
index 000000000000..665ce878b2e5
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/cloudbuild.yaml
@@ -0,0 +1,183 @@
+steps:
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "build",
+        "-t",
+        "${_GCR_PATH}/kaggle_download:$COMMIT_SHA",
+        "-t",
+        "${_GCR_PATH}/kaggle_download:latest",
+        "${_CODE_PATH}/download_dataset",
+        "-f",
+        "${_CODE_PATH}/download_dataset/Dockerfile",
+      ]
+    id: "BuildDownloadDataImage"
+
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "push",
+        "${_GCR_PATH}/kaggle_download:$COMMIT_SHA",
+      ]
+    id: "PushDownloadDataImage"
+    waitFor: ["BuildDownloadDataImage"]
+
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "build",
+        "-t",
+        "${_GCR_PATH}/kaggle_visualize_table:$COMMIT_SHA",
+        "-t",
+        "${_GCR_PATH}/kaggle_visualize_table:latest",
+        "${_CODE_PATH}/visualize_table",
+        "-f",
+        "${_CODE_PATH}/visualize_table/Dockerfile",
+      ]
+    id: "BuildVisualizeTableImage"
+
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "push",
+        "${_GCR_PATH}/kaggle_visualize_table:$COMMIT_SHA",
+      ]
+    id: "PushVisualizeTableImage"
+    waitFor: ["BuildVisualizeTableImage"]
+
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "build",
+        "-t",
+        "${_GCR_PATH}/kaggle_visualize_html:$COMMIT_SHA",
+        "-t",
+        "${_GCR_PATH}/kaggle_visualize_html:latest",
+        "${_CODE_PATH}/visualize_html",
+        "-f",
+        "${_CODE_PATH}/visualize_html/Dockerfile",
+      ]
+    id: "BuildVisualizeHTMLImage"
+
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "push",
+        "${_GCR_PATH}/kaggle_visualize_html:$COMMIT_SHA",
+      ]
+    id: "PushVisualizeHTMLImage"
+    waitFor: ["BuildVisualizeHTMLImage"]
+
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "build",
+        "-t",
+        "${_GCR_PATH}/kaggle_train:$COMMIT_SHA",
+        "-t",
+        "${_GCR_PATH}/kaggle_train:latest",
+        "${_CODE_PATH}/train_model",
+        "-f",
+        "${_CODE_PATH}/train_model/Dockerfile",
+      ]
+    id: "BuildTrainImage"
+
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "push",
+        "${_GCR_PATH}/kaggle_train:$COMMIT_SHA",
+      ]
+    id: "PushTrainImage"
+    waitFor: ["BuildTrainImage"]
+
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "build",
+        "-t",
+        "${_GCR_PATH}/kaggle_submit:$COMMIT_SHA",
+        "-t",
+        "${_GCR_PATH}/kaggle_submit:latest",
+        "${_CODE_PATH}/submit_result",
+        "-f",
+        "${_CODE_PATH}/submit_result/Dockerfile",
+      ]
+    id: "BuildSubmitImage"
+
+  - name: "gcr.io/cloud-builders/docker"
+    args:
+      [
+        "push",
+        "${_GCR_PATH}/kaggle_submit:$COMMIT_SHA",
+      ]
+    id: "PushSubmitImage"
+    waitFor: ["BuildSubmitImage"]
+
+  - name: "python:3.7-slim"
+    entrypoint: "/bin/sh"
+    args: [
+      "-c",
+      "set -ex;
+      cd ${_CODE_PATH};
+      pip3 install cffi==1.12.3 --upgrade;
+      pip3 install kfp==0.1.38;
+      sed -i 's|image: download_image_location|image: ${_GCR_PATH}/kaggle_download:$COMMIT_SHA|g' ./download_dataset/component.yaml;
+      sed -i 's|image: visualizetable_image_location|image: ${_GCR_PATH}/kaggle_visualize_table:$COMMIT_SHA|g' ./visualize_table/component.yaml;
+      sed -i 's|image: visualizehtml_image_location|image: ${_GCR_PATH}/kaggle_visualize_html:$COMMIT_SHA|g' ./visualize_html/component.yaml;
+      sed -i 's|image: train_image_location|image: ${_GCR_PATH}/kaggle_train:$COMMIT_SHA|g' ./train_model/component.yaml;
+      sed -i 's|image: submit_image_location|image: ${_GCR_PATH}/kaggle_submit:$COMMIT_SHA|g' ./submit_result/component.yaml;
+      python pipeline.py
+      --gcr_address ${_GCR_PATH};
+      cp pipeline.py.zip /workspace/pipeline.zip",
+    ]
+    id: "KagglePackagePipeline"
+
+  - name: "gcr.io/cloud-builders/gsutil"
+    args:
+      [
+        "cp",
+        "/workspace/pipeline.zip",
+        "${_GS_BUCKET}/$COMMIT_SHA/pipeline.zip"
+      ]
+    id: "KaggleUploadPipeline"
+    waitFor: ["KagglePackagePipeline"]
+
+  - name: "gcr.io/cloud-builders/kubectl"
+    entrypoint: "/bin/sh"
+    args: [
+      "-c",
+      "cd ${_CODE_PATH};
+      apt-get update;
+      apt-get install -y python3-pip;
+      apt-get install -y libssl-dev libffi-dev;
+      /builder/kubectl.bash;
+      pip3 install kfp;
+      pip3 install kubernetes;
+      python3 create_pipeline_version_and_run.py
+      --pipeline_id ${_PIPELINE_ID}
+      --commit_sha $COMMIT_SHA
+      --bucket_name ${_GS_BUCKET}
+      --gcr_address ${_GCR_PATH}"
+    ]
+    env:
+      - "CLOUDSDK_COMPUTE_ZONE=[Your cluster zone, for example: us-central1-a]"
+      - "CLOUDSDK_CONTAINER_CLUSTER=[Your cluster name, for example: my-cluster]"
+    id: "KaggleCreatePipelineVersionAndRun"
+
+images:
+  - "${_GCR_PATH}/kaggle_download:latest"
+  - "${_GCR_PATH}/kaggle_visualize_table:latest"
+  - "${_GCR_PATH}/kaggle_visualize_html:latest"
+  - "${_GCR_PATH}/kaggle_train:latest"
+  - "${_GCR_PATH}/kaggle_submit:latest"
+
+substitutions:
+  _CODE_PATH: /workspace/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample
+  _NAMESPACE: kubeflow
+  _GCR_PATH: [Your container registry path. For example, gcr.io/my-project-id]
+  _GS_BUCKET: [Name of your Cloud Storage bucket. For example, gs://my-project-bucket]
+  _PIPELINE_ID: [Your Kubeflow pipeline id to create a version on. Get it from the Kubeflow Pipelines UI.
+    For example, f6f8558a-6eec-4ef4-b343-a650473ee613]
\ No newline at end of file
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/create_pipeline_version_and_run.py b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/create_pipeline_version_and_run.py
new file mode 100644
index 000000000000..7ec6fd8fba7a
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/create_pipeline_version_and_run.py
@@ -0,0 +1,47 @@
+import kfp
+import argparse
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--commit_sha', help='Required. Commit SHA, used as the version name. Must be unique.', type=str)
+parser.add_argument('--pipeline_id', help='Required. Pipeline id.', type=str)
+parser.add_argument('--bucket_name', help='Required. GCS bucket that stores the pipeline package.', type=str)
+parser.add_argument('--gcr_address', help='Required. Container registry address. For example, gcr.io/my-project.', type=str)
+parser.add_argument('--host', help='Host address of kfp.Client. Obtained from the cluster automatically if not set.', type=str, default='')
+parser.add_argument('--run_name', help='Name of the new run.', type=str, default='')
+parser.add_argument('--experiment_id', help='Experiment id.', type=str)
+parser.add_argument('--code_source_url', help='URL of the source code.', type=str, default='')
+args = parser.parse_args()
+
+if args.host:
+    client = kfp.Client(host=args.host)
+else:
+    client = kfp.Client()
+
+# Create a pipeline version whose package is the pipeline.zip uploaded by Cloud Build.
+import os
+# Strip a leading 'gs://' scheme if present (lstrip('gs://') would also drop any leading g/s/:/ characters of the bucket name itself).
+bucket = args.bucket_name.replace('gs://', '', 1)
+package_url = os.path.join('https://storage.googleapis.com', bucket, args.commit_sha, 'pipeline.zip')
+version_name = args.commit_sha
+version_body = {
+    "name": version_name,
+    "code_source_url": args.code_source_url,
+    "package_url": {"pipeline_url": package_url},
+    # Resource key type 3 = PIPELINE, relationship 1 = OWNER.
+    "resource_references": [{"key": {"id": args.pipeline_id, "type": 3}, "relationship": 1}]}
+
+response = client.pipelines.create_pipeline_version(version_body)
+version_id = response.id
+
+# Create a run from the new version.
+run_name = args.run_name if args.run_name else 'run' + version_id
+# Resource key type 4 = PIPELINE_VERSION, relationship 2 = CREATOR.
+resource_references = [{"key": {"id": version_id, "type": 4}, "relationship": 2}]
+if args.experiment_id:
+    # Resource key type 1 = EXPERIMENT, relationship 1 = OWNER.
+    resource_references.append({"key": {"id": args.experiment_id, "type": 1}, "relationship": 1})
+run_body = {
+    "name": run_name,
+    "pipeline_spec": {"parameters": [{"name": "bucket_name", "value": args.bucket_name},
+                                     {"name": "commit_sha", "value": args.commit_sha}]},
+    "resource_references": resource_references}
+try:
+    client.runs.create_run(run_body)
+except Exception as e:
+    print('Error creating run: {}'.format(e))
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/download_dataset/Dockerfile b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/download_dataset/Dockerfile
new file mode 100644
index 000000000000..ed85764d1288
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/download_dataset/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.7
+ENV KAGGLE_USERNAME=[YOUR KAGGLE USERNAME] \
+    KAGGLE_KEY=[YOUR KAGGLE KEY]
+RUN pip install kaggle
+RUN pip install google-cloud-storage
+COPY ./download_data.py .
+CMD ["python", "download_data.py"] \ No newline at end of file diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/download_dataset/component.yaml b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/download_dataset/component.yaml new file mode 100644 index 000000000000..f59290ac3cbe --- /dev/null +++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/download_dataset/component.yaml @@ -0,0 +1,15 @@ +name: download dataset +description: visualize training in tensorboard +inputs: + - {name: bucket_name, type: GCSPath} +outputs: + - {name: train_dataset, type: string} + - {name: test_dataset, type: string} +implementation: + container: + image: download_image_location + command: ['python', 'download_data.py'] + args: ['--bucket_name', {inputValue: bucket_name}] + fileOutputs: + train_dataset: /train.txt + test_dataset: /test.txt \ No newline at end of file diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/download_dataset/download_data.py b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/download_dataset/download_data.py new file mode 100644 index 000000000000..7ae67d6696fe --- /dev/null +++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/download_dataset/download_data.py @@ -0,0 +1,31 @@ +""" +step #1: download data from kaggle website, and push it to gs bucket +""" + +def process_and_upload( + bucket_name +): + from google.cloud import storage + storage_client = storage.Client() + bucket = storage_client.get_bucket(bucket_name.lstrip('gs://')) + train_blob = bucket.blob('train.csv') + test_blob = bucket.blob('test.csv') + train_blob.upload_from_filename('train.csv') + test_blob.upload_from_filename('test.csv') + + with open('train.txt', 'w') as f: + f.write(bucket_name+'/train.csv') + with open('test.txt', 'w') as f: + f.write(bucket_name+'/test.csv') + +if __name__ == '__main__': + import os + os.system("kaggle competitions download -c house-prices-advanced-regression-techniques") + os.system("unzip house-prices-advanced-regression-techniques") + import argparse + parser = argparse.ArgumentParser() + parser.add_argument('--bucket_name', type=str) + args = parser.parse_args() + + process_and_upload(args.bucket_name) + \ No newline at end of file diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/pipeline.py b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/pipeline.py new file mode 100644 index 000000000000..d9f69a691af0 --- /dev/null +++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/pipeline.py @@ -0,0 +1,40 @@ +import kfp.dsl as dsl +import kfp.components as components +from kfp.gcp import use_gcp_secret + +@dsl.pipeline( + name = "kaggle pipeline", + description = "kaggle pipeline that goes from download data, analyse data, train model to submit result" +) +def kaggle_houseprice( + bucket_name: str, + commit_sha: str +): + + downloadDataOp = components.load_component_from_file('./download_dataset/component.yaml') + downloadDataStep = downloadDataOp(bucket_name=bucket_name).apply(use_gcp_secret('user-gcp-sa')) + + visualizeTableOp = components.load_component_from_file('./visualize_table/component.yaml') + visualizeTableStep = visualizeTableOp(train_file_path='%s' % downloadDataStep.outputs['train_dataset']).apply(use_gcp_secret('user-gcp-sa')) + + visualizeHTMLOp = components.load_component_from_file('./visualize_html/component.yaml') + visualizeHTMLStep = visualizeHTMLOp(train_file_path='%s' % 
+                                        commit_sha=commit_sha,
+                                        bucket_name=bucket_name).apply(use_gcp_secret('user-gcp-sa'))
+
+    trainModelOp = components.load_component_from_file('./train_model/component.yaml')
+    trainModelStep = trainModelOp(train_file='%s' % downloadDataStep.outputs['train_dataset'],
+                                  test_file='%s' % downloadDataStep.outputs['test_dataset'],
+                                  bucket_name=bucket_name).apply(use_gcp_secret('user-gcp-sa'))
+
+    submitResultOp = components.load_component_from_file('./submit_result/component.yaml')
+    submitResultStep = submitResultOp(result_file='%s' % trainModelStep.outputs['result'],
+                                      submit_message='submit').apply(use_gcp_secret('user-gcp-sa'))
+
+if __name__ == '__main__':
+    import kfp.compiler as compiler
+    import argparse
+    # --gcr_address is accepted because the CI build passes it, though compilation does not use it.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--gcr_address', type=str)
+    args = parser.parse_args()
+    compiler.Compiler().compile(kaggle_houseprice, __file__ + '.zip')
\ No newline at end of file
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/submit_result/Dockerfile b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/submit_result/Dockerfile
new file mode 100644
index 000000000000..8426ed1361ec
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/submit_result/Dockerfile
@@ -0,0 +1,7 @@
+FROM python:3.7
+ENV KAGGLE_USERNAME=[YOUR KAGGLE USERNAME] \
+    KAGGLE_KEY=[YOUR KAGGLE KEY]
+RUN pip install kaggle
+RUN pip install gcsfs
+COPY ./submit_result.py .
+CMD ["python", "submit_result.py"]
\ No newline at end of file
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/submit_result/component.yaml b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/submit_result/component.yaml
new file mode 100644
index 000000000000..7ee0cec98493
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/submit_result/component.yaml
@@ -0,0 +1,11 @@
+name: submit result
+description: submit the prediction result to Kaggle
+inputs:
+  - {name: result_file, type: string}
+  - {name: submit_message, type: string}
+implementation:
+  container:
+    image: submit_image_location
+    command: ['python', 'submit_result.py']
+    args: ['--result_file', {inputValue: result_file},
+           '--submit_message', {inputValue: submit_message}]
\ No newline at end of file
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/submit_result/submit_result.py b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/submit_result/submit_result.py
new file mode 100644
index 000000000000..20237e805933
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/submit_result/submit_result.py
@@ -0,0 +1,23 @@
+"""
+step #4: submit the result to Kaggle
+"""
+
+def download_result(
+    result_file
+):
+    import gcsfs
+    fs = gcsfs.GCSFileSystem()
+    fs.get(result_file, 'submission.csv')
+
+if __name__ == '__main__':
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--result_file', type=str)
+    parser.add_argument('--submit_message', type=str, default='default submit')
+    args = parser.parse_args()
+
+    download_result(args.result_file)
+    import os
+    # Quote the message so that multi-word messages survive the shell.
+    os.system('kaggle competitions submit -c house-prices-advanced-regression-techniques -f submission.csv -m "%s"' % args.submit_message)
\ No newline at end of file
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/train_model/Dockerfile b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/train_model/Dockerfile
new file mode 100644
index 000000000000..6ff89ab3322e
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/train_model/Dockerfile
@@ -0,0 +1,5 @@
+FROM python:3.7
+COPY ./train.py .
+RUN pip install pandas==0.25.1
+RUN pip install gcsfs numpy matplotlib seaborn scikit-learn
+CMD ["python", "train.py"]
\ No newline at end of file
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/train_model/component.yaml b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/train_model/component.yaml
new file mode 100644
index 000000000000..eb13e7ed22e2
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/train_model/component.yaml
@@ -0,0 +1,17 @@
+name: train model
+description: train the model and produce predictions for the test set
+inputs:
+  - {name: train_file, type: string}
+  - {name: test_file, type: string}
+  - {name: bucket_name, type: string}
+outputs:
+  - {name: result, type: string}
+implementation:
+  container:
+    image: train_image_location
+    command: ['python', 'train.py']
+    args: ['--train_file', {inputValue: train_file},
+           '--test_file', {inputValue: test_file},
+           '--bucket_name', {inputValue: bucket_name}]
+    fileOutputs:
+      result: /result_path.txt
\ No newline at end of file
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/train_model/train.py b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/train_model/train.py
new file mode 100644
index 000000000000..24810923a069
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/train_model/train.py
@@ -0,0 +1,216 @@
+"""
+Copyright 2020 Google LLC
+Copyright (c) 2017 Sergei Neviadomski
+
+This file is modified based on code from Kaggle user Sergei Neviadomski.
+The original code can be found at:
+
+    https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+""" +def train( + train_file, + test_file, + bucket_name +): + train = pd.read_csv(train_file) + test = pd.read_csv(test_file) + + # Prints R2 and RMSE scores + def get_score(prediction, lables): + print('R2: {}'.format(r2_score(prediction, lables))) + print('RMSE: {}'.format(np.sqrt(mean_squared_error(prediction, lables)))) + + # Shows scores for train and validation sets + def train_test(estimator, x_trn, x_tst, y_trn, y_tst): + prediction_train = estimator.predict(x_trn) + # Printing estimator + print(estimator) + # Printing train scores + get_score(prediction_train, y_trn) + prediction_test = estimator.predict(x_tst) + # Printing test scores + print("Test") + get_score(prediction_test, y_tst) + + # Spliting to features and lables and deleting variable I don't need + train_labels = train.pop('SalePrice') + + features = pd.concat([train, test], keys=['train', 'test']) + + # I decided to get rid of features that have more than half of missing information or do not correlate to SalePrice + features.drop(['Utilities', 'RoofMatl', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating', 'LowQualFinSF', + 'BsmtFullBath', 'BsmtHalfBath', 'Functional', 'GarageYrBlt', 'GarageArea', 'GarageCond', 'WoodDeckSF', + 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal'], + axis=1, inplace=True) + + # MSSubClass as str + features['MSSubClass'] = features['MSSubClass'].astype(str) + + # MSZoning NA in pred. filling with most popular values + features['MSZoning'] = features['MSZoning'].fillna(features['MSZoning'].mode()[0]) + + # LotFrontage NA in all. I suppose NA means 0 + features['LotFrontage'] = features['LotFrontage'].fillna(features['LotFrontage'].mean()) + + # Alley NA in all. NA means no access + features['Alley'] = features['Alley'].fillna('NOACCESS') + + # Converting OverallCond to str + features.OverallCond = features.OverallCond.astype(str) + + # MasVnrType NA in all. filling with most popular values + features['MasVnrType'] = features['MasVnrType'].fillna(features['MasVnrType'].mode()[0]) + + # BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, BsmtFinType2 + # NA in all. NA means No basement + for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'): + features[col] = features[col].fillna('NoBSMT') + + # TotalBsmtSF NA in pred. I suppose NA means 0 + features['TotalBsmtSF'] = features['TotalBsmtSF'].fillna(0) + + # Electrical NA in pred. filling with most popular values + features['Electrical'] = features['Electrical'].fillna(features['Electrical'].mode()[0]) + + # KitchenAbvGr to categorical + features['KitchenAbvGr'] = features['KitchenAbvGr'].astype(str) + + # KitchenQual NA in pred. filling with most popular values + features['KitchenQual'] = features['KitchenQual'].fillna(features['KitchenQual'].mode()[0]) + + # FireplaceQu NA in all. NA means No Fireplace + features['FireplaceQu'] = features['FireplaceQu'].fillna('NoFP') + + # GarageType, GarageFinish, GarageQual NA in all. NA means No Garage + for col in ('GarageType', 'GarageFinish', 'GarageQual'): + features[col] = features[col].fillna('NoGRG') + + # GarageCars NA in pred. I suppose NA means 0 + features['GarageCars'] = features['GarageCars'].fillna(0.0) + + # SaleType NA in pred. 
+    features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])
+
+    # Year and Month to categorical
+    features['YrSold'] = features['YrSold'].astype(str)
+    features['MoSold'] = features['MoSold'].astype(str)
+
+    # Adding a total square footage feature and removing the Basement, 1st and 2nd floor features
+    features['TotalSF'] = features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']
+    features.drop(['TotalBsmtSF', '1stFlrSF', '2ndFlrSF'], axis=1, inplace=True)
+
+    ## Log transformation of labels
+    train_labels = np.log(train_labels)
+
+    ## Standardizing numeric features
+    numeric_features = features.loc[:, ['LotFrontage', 'LotArea', 'GrLivArea', 'TotalSF']]
+    numeric_features_standardized = (numeric_features - numeric_features.mean()) / numeric_features.std()
+
+    # Getting dummies from Condition1 and Condition2
+    conditions = set([x for x in features['Condition1']] + [x for x in features['Condition2']])
+    dummies = pd.DataFrame(data=np.zeros((len(features.index), len(conditions))),
+                           index=features.index, columns=conditions)
+    for i, cond in enumerate(zip(features['Condition1'], features['Condition2'])):
+        dummies.ix[i, cond] = 1
+    features = pd.concat([features, dummies.add_prefix('Condition_')], axis=1)
+    features.drop(['Condition1', 'Condition2'], axis=1, inplace=True)
+
+    # Getting dummies from Exterior1st and Exterior2nd
+    exteriors = set([x for x in features['Exterior1st']] + [x for x in features['Exterior2nd']])
+    dummies = pd.DataFrame(data=np.zeros((len(features.index), len(exteriors))),
+                           index=features.index, columns=exteriors)
+    for i, ext in enumerate(zip(features['Exterior1st'], features['Exterior2nd'])):
+        dummies.ix[i, ext] = 1
+    features = pd.concat([features, dummies.add_prefix('Exterior_')], axis=1)
+    features.drop(['Exterior1st', 'Exterior2nd', 'Exterior_nan'], axis=1, inplace=True)
+
+    # Getting dummies from all other categorical vars
+    for col in features.dtypes[features.dtypes == 'object'].index:
+        for_dummy = features.pop(col)
+        features = pd.concat([features, pd.get_dummies(for_dummy, prefix=col)], axis=1)
+
+    ### Copying features
+    features_standardized = features.copy()
+
+    ### Replacing numeric features by standardized values
+    features_standardized.update(numeric_features_standardized)
+
+    ### Splitting features
+    train_features = features.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
+    test_features = features.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
+
+    ### Splitting standardized features
+    train_features_st = features_standardized.loc['train'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
+    test_features_st = features_standardized.loc['test'].drop('Id', axis=1).select_dtypes(include=[np.number]).values
+
+    ### Shuffling train sets
+    train_features_st, train_features, train_labels = shuffle(train_features_st, train_features, train_labels, random_state=5)
+
+    ### Splitting
+    x_train, x_test, y_train, y_test = train_test_split(train_features, train_labels, test_size=0.1, random_state=200)
+    x_train_st, x_test_st, y_train_st, y_test_st = train_test_split(train_features_st, train_labels, test_size=0.1, random_state=200)
+
+    ENSTest = linear_model.ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10], l1_ratio=[.01, .1, .5, .9, .99], max_iter=5000).fit(x_train_st, y_train_st)
+    train_test(ENSTest, x_train_st, x_test_st, y_train_st, y_test_st)
+
+    # Average R2 score and standard deviation of 5-fold cross-validation
+    scores = cross_val_score(ENSTest, train_features_st, train_labels, cv=5)
+    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
+
+    GBest = ensemble.GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features='sqrt',
+                                               min_samples_leaf=15, min_samples_split=10, loss='huber').fit(x_train, y_train)
+    train_test(GBest, x_train, x_test, y_train, y_test)
+
+    # Average R2 score and standard deviation of 5-fold cross-validation
+    scores = cross_val_score(GBest, train_features_st, train_labels, cv=5)
+    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
+
+    # Retraining models on the full training set
+    GB_model = GBest.fit(train_features, train_labels)
+    ENST_model = ENSTest.fit(train_features_st, train_labels)
+
+    ## Getting our SalePrice estimation by averaging both models
+    Final_labels = (np.exp(GB_model.predict(test_features)) + np.exp(ENST_model.predict(test_features_st))) / 2
+
+    ## Saving to CSV
+    import os
+    result_path = os.path.join(bucket_name, 'submission.csv')
+    pd.DataFrame({'Id': test.Id, 'SalePrice': Final_labels}).to_csv(result_path, index=False)
+
+    with open('/result_path.txt', 'w') as f:
+        f.write(result_path)
+
+if __name__ == '__main__':
+    # Adding needed libraries and reading data
+    import pandas as pd
+    import numpy as np
+    from sklearn import ensemble, tree, linear_model
+    from sklearn.model_selection import train_test_split, cross_val_score
+    from sklearn.metrics import r2_score, mean_squared_error
+    from sklearn.utils import shuffle
+    import warnings
+    warnings.filterwarnings('ignore')
+
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--train_file', type=str)
+    parser.add_argument('--test_file', type=str)
+    parser.add_argument('--bucket_name', type=str)
+
+    args = parser.parse_args()
+    train(args.train_file, args.test_file, args.bucket_name)
+
diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/visualize_html/Dockerfile b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/visualize_html/Dockerfile
new file mode 100644
index 000000000000..ebe4fc4edb4f
--- /dev/null
+++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/visualize_html/Dockerfile
@@ -0,0 +1,4 @@
+FROM tensorflow/tensorflow:2.0.0-py3
+RUN pip install gcsfs pandas matplotlib seaborn
+COPY ./visualize.py .
+CMD ["python", 'visualize.py'] \ No newline at end of file diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/visualize_html/component.yaml b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/visualize_html/component.yaml new file mode 100644 index 000000000000..38cf640e2079 --- /dev/null +++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/visualize_html/component.yaml @@ -0,0 +1,17 @@ +name: visualize in html +description: visualize dataset in picture written in html +inputs: + - {name: train_file_path, type: string} + - {name: commit_sha, type: string} + - {name: bucket_name, type: string} +outputs: + - {name: MLPipeline UI metadata, type: UI metadata} +implementation: + container: + image: visualizehtml_image_location + command: ['python', 'visualize.py'] + args: ['--train_file_path', {inputValue: train_file_path}, + '--commit_sha', {inputValue: commit_sha}, + '--bucket_name', {inputValue: bucket_name}] + fileOutputs: + MLPipeline UI metadata: /mlpipeline-ui-metadata.json \ No newline at end of file diff --git a/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/visualize_html/visualize.py b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/visualize_html/visualize.py new file mode 100644 index 000000000000..99476b6cfceb --- /dev/null +++ b/samples/contrib/versioned-pipeline-ci-samples/kaggle-ci-sample/visualize_html/visualize.py @@ -0,0 +1,54 @@ +# visualizer with html + +def datahtml( + bucket_name, + commit_sha, + train_file_path +): + import json + import seaborn as sns + import matplotlib.pyplot as plt + import os + image_path = os.path.join(bucket_name, commit_sha, 'visualization.png') + image_url = os.path.join('https://storage.googleapis.com', bucket_name.lstrip('gs://'), commit_sha, 'visualization.png') + html_path = os.path.join(bucket_name, 'kaggle.html') + # ouptut visualization to a file + + import pandas as pd + df_train = pd.read_csv(train_file_path) + sns.set() + cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt'] + sns.pairplot(df_train[cols], size = 3) + plt.savefig('visualization.png') + from tensorflow.python.lib.io import file_io + file_io.copy('visualization.png', image_path) + rendered_template = """ + +
+