
Commit a96cb79

Use unified DB-API in codebase (#2821)
* Add maxcompute DB-API
* remove unused import
* format code
* polish db-api
* Adapt paiio with DB-API
* Adapt paiio with DB-API
* add try import paiio
* use db-api in old code
* DB-API support Python2 so can run on PAI
* polish db-api to support Python2 so can run on PAI
* polish db-api to support Python2 so can run on PAI
* polish db-api to support Python2 so can run on PAI
* Use unified DB-API in codebase.
* Use unified DB-API in codebase.
* polish code
* remove debug info
* fix ut
1 parent 6556299 commit a96cb79
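In short, this commit moves statement execution from module-level helpers in `runtime/db.py` onto the `Connection` objects returned by `connect_with_data_source`. A minimal sketch of the call-site change, based on the usages visible in the diffs below (the datasource URI is a placeholder):

```python
# Sketch of the call-site change; the datasource URI is a made-up example.
from runtime import db

conn = db.connect_with_data_source("mysql://user:pass@127.0.0.1:3306/iris")

# Before this commit, module-level helpers took the connection as an argument:
#   db.execute(conn, "DROP TABLE IF EXISTS t")
# After, statements run through the unified Connection object itself:
conn.execute("DROP TABLE IF EXISTS t")
```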

File tree

23 files changed: +276 −781 lines changed


python/runtime/db.py

Lines changed: 55 additions & 409 deletions
Large diffs are not rendered by default.

python/runtime/db_test.py

Lines changed: 77 additions & 192 deletions
Large diffs are not rendered by default.

python/runtime/db_writer/hive.py

Lines changed: 2 additions & 23 deletions
```diff
@@ -39,26 +39,8 @@ def __init__(self,
         self.hdfs_user = hdfs_user
         self.hdfs_pass = hdfs_pass
 
-    def _column_list(self):
-        # NOTE(yancey1989): for the tablename: mydb.tblname, if 'mydb' is
-        # a tablename in the default database, Hive describe STATEMENT would
-        # mistake 'tblname' to a column name.
-        cursor = self.conn.cursor()
-        table_parts = self.table_name.split(".")
-        if len(table_parts) == 2:
-            db, table_name = table_parts[0], table_parts[1]
-            cursor.execute("use %s" % db)
-            cursor.execute("describe %s" % table_name)
-        elif len(table_parts) == 1:
-            cursor.execute("describe %s" % self.table_name)
-        else:
-            raise ValueError("")
-        result = cursor.fetchall()
-        cursor.execute("use %s " % self.conn.default_db)
-        return result
-
     def _indexing_table_schema(self, table_schema):
-        column_list = self._column_list()
+        column_list = self.conn.get_table_schema(self.table_name)
 
         schema_idx = []
         idx_map = {}
@@ -113,12 +95,9 @@ def write_hive_table(self):
             cmd_namenode_str, self.tmp_f.name, hdfs_path, self.table_name)
         subprocess.check_output(cmd_str.split(), env=hdfs_envs)
         # load CSV into Hive
-        cursor = self.conn.cursor()
         load_sql = "LOAD DATA INPATH '%s/%s/' OVERWRITE INTO TABLE %s" % (
             hdfs_path, self.table_name, self.table_name)
-        cursor.execute(load_sql)
-        self.conn.commit()
-        cursor.close()
+        self.conn.execute(load_sql)
 
         # remove the temporary dir on hdfs
         cmd_str = "hdfs dfs %s -rm -r -f %s/%s/" % (cmd_namenode_str,
```
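The deleted `_column_list` method shows the payoff of the unification: qualified-name handling (`db.table`) and the Hive `describe` workaround now live behind `Connection.get_table_schema` instead of being re-implemented in every writer. A hedged sketch of the new calls, with a placeholder Hive URI and table name:

```python
# Hedged sketch; the Hive URI and table name below are placeholders.
from runtime import db

conn = db.connect_with_data_source("hive://user:pass@127.0.0.1:10000/mydb")
column_list = conn.get_table_schema("mydb.tbl")  # e.g. [("a", "INT"), ...]

# Loading data also goes through conn.execute, which takes over the
# cursor/commit/close bookkeeping the old code did by hand.
conn.execute("LOAD DATA INPATH '/tmp/data/' OVERWRITE INTO TABLE mydb.tbl")
```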

python/runtime/dbapi/connection.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -17,8 +17,7 @@
 from six.moves.urllib.parse import parse_qs, urlparse
 
 
-@six.add_metaclass(ABCMeta)
-class ResultSet(object):
+class ResultSet(six.Iterator):
     """Base class for DB query result, caller can iteratable this object
     to get all result rows"""
     def __init__(self):
```
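Swapping the `ABCMeta` metaclass for `six.Iterator` is what makes result sets iterable on the Python 2 runtime PAI ships: on Python 2, `six.Iterator` aliases `next()` to the class's `__next__()`, and on Python 3 it is just `object`. A toy illustration of the pattern (not the real `ResultSet`):

```python
import six


class ToyResultSet(six.Iterator):
    """Yields rows on both Python 2 and 3 with a single __next__."""
    def __init__(self, rows):
        self._it = iter(rows)

    def __iter__(self):
        return self

    def __next__(self):  # six.Iterator maps this to next() on Python 2
        return next(self._it)


for row in ToyResultSet([(1, "a"), (2, "b")]):
    print(row)
```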

python/runtime/dbapi/paiio.py

Lines changed: 56 additions & 26 deletions
```diff
@@ -75,7 +75,11 @@ class PaiIOConnection(Connection):
     currently only support full-table reading. That means
     we can't filter the data, join the table and so on.
     The only supported query statement is `None`. The scheme
-    part of the uri can be 'paiio' or 'odps'
+    part of the uri can be 'paiio' or 'odps'.
+
+    A PaiIOConnection always binds to a specific table.
+    Initializing a PaiIOConnection does not establish any real
+    connection, so feel free to create one whenever needed.
 
     Typical use is:
     con = PaiIOConnection("paiio://db/tables/my_table")
@@ -85,14 +89,17 @@ class PaiIOConnection(Connection):
     def __init__(self, conn_uri):
         super(PaiIOConnection, self).__init__(conn_uri)
         # (TODO: lhw) change driver to paiio
-        self.driver = "pai_maxcompute"
+        self.driver = "paiio"
         match = re.findall(r"\w+://\w+/tables/(.+)", conn_uri)
         if len(match) < 1:
             raise ValueError("Should specify table in uri with format: "
-                             "paiio://db/tables/table?param_a=a&param_b=b")
-        self.params["table"] = conn_uri.replace("paiio://", "odps://")
-        self.params["slice_id"] = self.params.get("slice_id", 0)
-        self.params["slice_count"] = self.params.get("slice_count", 1)
+                             "paiio://db/tables/table?param_a=a&param_b=b, "
+                             "but got: %s" % conn_uri)
+
+        table = self.uripts._replace(scheme="odps", query="")
+        self.params["table"] = table.geturl()
+        self.params["slice_id"] = int(self.params.get("slice_id", "0"))
+        self.params["slice_count"] = int(self.params.get("slice_count", "1"))
 
     def _get_result_set(self, statement):
         if statement is not None:
@@ -106,44 +113,67 @@ def _get_result_set(self, statement):
         except Exception as e:
             return PaiIOResultSet(None, str(e))
 
-    def get_table_schema(self, full_uri):
-        """Get schema of given table, caller need to supply the full
-        uri for paiio table, this is slight different with other connections.
-        """
-        return PaiIOConnection.get_schema(full_uri)
-
     def query(self, statement=None):
         return super(PaiIOConnection, self).query(statement)
 
-    @staticmethod
-    def get_table_row_num(table_uri):
-        """Get row number of given table
-
-        Args:
-            table_uri: the full uri for the table to get row from
+    def get_table_row_num(self):
+        """Get row number of the bound table
 
         Return:
             Number of rows in the table
         """
-        reader = paiio.TableReader(table_uri)
+        reader = paiio.TableReader(self.params["table"])
         row_num = reader.get_row_count()
         reader.close()
         return row_num
 
-    @staticmethod
-    def get_schema(table_uri):
-        """Get schema of the given table
-
-        Args:
-            table_uri: the full uri for the table to get row from
+    def get_schema(self):
+        """Get schema of the bound table
 
         Returns:
             A list of column metas, like [(field_a, INT), (field_b, STRING)]
         """
-        rs = PaiIOConnection(table_uri).query()
+        rs = self.query()
         col_info = rs.column_info()
         rs.close()
         return col_info
 
+    @staticmethod
+    def from_table(table_name, slice_id=0, slice_count=1):
+        """Get a connection object for the given table; if slice_count > 1,
+        bind to a table slice
+
+        Args:
+            table_name: an odps table name in format: db.table
+            slice_id: the slice id for binding
+            slice_count: total slice count
+
+        Returns:
+            A PaiIOConnection object
+        """
+        uri = PaiIOConnection.get_uri_of_table(table_name, slice_id,
+                                               slice_count)
+        return PaiIOConnection(uri)
+
+    @staticmethod
+    def get_uri_of_table(table_name, slice_id=0, slice_count=1):
+        """Build a connection uri from a table name
+
+        Args:
+            table_name: a table name in format: db.table
+            slice_id: the slice id for binding
+            slice_count: total slice count
+
+        Returns:
+            A uri for the table slice with which we can get a connection
+            by PaiIOConnection()
+        """
+        pts = table_name.split(".")
+        if len(pts) != 2:
+            raise ValueError("paiio table name should be in db.table format.")
+        uri = "paiio://%s/tables/%s?slice_id=%d&slice_count=%d" % (
+            pts[0], pts[1], slice_id, slice_count)
+        return uri
+
     def close(self):
         pass
```
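With the connection now bound to a single table, typical use follows the new `from_table` factory. A hedged sketch, assuming `paiio` is importable (i.e. the code runs on PAI) and using a made-up table name:

```python
# Hedged sketch; requires the paiio runtime on PAI, table name is a placeholder.
from runtime.dbapi.paiio import PaiIOConnection

conn = PaiIOConnection.from_table("my_project.my_table")
print(conn.get_table_row_num())  # row count via paiio.TableReader
print(conn.get_schema())         # e.g. [("field_a", "INT"), ...]

rs = conn.query()  # statement must be None: only full-table reads
for row in rs:
    pass           # consume rows here
rs.close()
conn.close()
```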

python/runtime/model/db.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -60,7 +60,7 @@ def write_with_generator(datasource, table, gen):
     _create_table(conn, table)
     idx = 0
 
-    with buffered_db_writer(conn.driver, conn, table, ["id", "block"]) as w:
+    with buffered_db_writer(conn, table, ["id", "block"]) as w:
         for d in gen():
             block = base64.b64encode(d)
             row = [idx, block]
```
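The writer drops its redundant `driver` argument because the unified `Connection` already carries the driver. A sketch of the slimmer API, under the assumption that `buffered_db_writer` and `connect_with_data_source` are importable from `runtime.db` and that the table and column names below are placeholders:

```python
# Hedged sketch; table and column names are placeholders.
from runtime.db import buffered_db_writer, connect_with_data_source

conn = connect_with_data_source("mysql://user:pass@127.0.0.1:3306/sqlflow")
with buffered_db_writer(conn, "sqlflow_models", ["id", "block"]) as w:
    w.write([0, "aGVsbG8="])  # one row per write; flushed in batches
```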

python/runtime/optimize/local.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -231,8 +231,7 @@ def save_solved_result_in_db(solved_result, data_frame, variables,
     data_frame[result_value_name] = solved_result[0]
 
     conn = db.connect_with_data_source(datasource)
-    with db.buffered_db_writer(conn.driver, conn, result_table,
-                               column_names) as w:
+    with db.buffered_db_writer(conn, result_table, column_names) as w:
         for i in six.moves.range(len(data_frame)):
             rows = list(data_frame.loc[i])
             w.write(rows)
```

python/runtime/pai/kmeans.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -52,7 +52,7 @@ def get_train_kmeans_pai_cmd(datasource, model_name, data_table, model_attrs,
     ]
 
     conn = db.connect_with_data_source(datasource)
-    db.execute(conn, "DROP TABLE IF EXISTS %s" % idx_table_name)
+    conn.execute("DROP TABLE IF EXISTS %s" % idx_table_name)
 
     return (
         """pai -name kmeans -project algo_public """
```

python/runtime/pai/random_forest.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -66,7 +66,7 @@ def get_explain_random_forest_pai_cmd(datasource, model_name, data_table,
     conn = db.connect_with_data_source(datasource)
     schema = db.get_table_schema(conn, data_table)
     columns = [f[0] for f in schema]
-    db.execute(conn, "DROP TABLE IF EXISTS %s;" % result_table)
+    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
     return (
         """pai -name feature_importance -project algo_public """
         """-DmodelName="%s" -DinputTableName="%s" -DoutputTableName="%s" """
```

python/runtime/pai/submitter.py

Lines changed: 20 additions & 18 deletions
```diff
@@ -21,6 +21,7 @@
 from os import path
 
 from runtime import db
+from runtime.dbapi.maxcompute import MaxComputeConnection
 from runtime.diagnostics import SQLFlowDiagnostic
 from runtime.model import EstimatorType, oss
 from runtime.pai import cluster_conf
@@ -47,6 +48,7 @@
 XGB_REQUIREMENT = TF_REQUIREMENT + """
 xgboost==0.82
 sklearn2pmml==0.56.0
+sklearn_pandas==1.6.0
 """
 
 
@@ -93,7 +95,7 @@ def create_tmp_table_from_select(select, datasource):
         tmp_tb_name, LIFECYCLE_ON_TMP_TABLE, select)
     # (NOTE: lhw) maxcompute conn doesn't support close
     # we should unify db interface
-    if not db.execute(conn, create_sql):
+    if not conn.execute(create_sql):
         raise SQLFlowDiagnostic("Can't crate tmp table for %s" % select)
     return "%s.%s" % (project, tmp_tb_name)
 
@@ -105,7 +107,7 @@ def drop_tables(tables, datasource):
         for table in tables:
             if table != "":
                 drop_sql = "DROP TABLE IF EXISTS %s" % table
-                db.execute(conn, drop_sql)
+                conn.execute(drop_sql)
     except:  # noqa: E722
         # odps will clear table itself, so even fail here, we do
         # not need to raise error
@@ -130,15 +132,18 @@ def get_oss_model_url(model_full_path):
     return "oss://%s/%s" % (oss.SQLFLOW_MODELS_BUCKET, model_full_path)
 
 
+def parse_maxcompute_dsn(datasource):
+    return MaxComputeConnection.get_uri_parts(datasource)
+
+
 def drop_pai_model(datasource, model_name):
     """Drop PAI model
 
     Args:
         datasource: current datasource
         model_name: name of the model to drop
     """
-    dsn = get_datasource_dsn(datasource)
-    user, passwd, address, database = db.parseMaxComputeDSN(dsn)
+    user, passwd, address, database = parse_maxcompute_dsn(datasource)
     cmd = "drop offlinemodel if exists %s" % model_name
     subprocess.run([
         "odpscmd", "-u", user, "-p", passwd, "--project", database,
@@ -215,8 +220,7 @@ def submit_pai_task(pai_cmd, datasource):
         pai_cmd: The command to submit
         datasource: The datasource this cmd will manipulate
     """
-    dsn = get_datasource_dsn(datasource)
-    user, passwd, address, project = db.parseMaxComputeDSN(dsn)
+    user, passwd, address, project = parse_maxcompute_dsn(datasource)
     cmd = [
         "odpscmd", "--instance-priority", "9", "-u", user, "-p", passwd,
         "--project", project, "--endpoint", address, "-e", pai_cmd
@@ -230,8 +234,7 @@ def submit_pai_task(pai_cmd, datasource):
 def get_oss_model_save_path(datasource, model_name):
     if not model_name:
         return None
-    dsn = get_datasource_dsn(datasource)
-    user, _, _, project = db.parseMaxComputeDSN(dsn)
+    user, _, _, project = parse_maxcompute_dsn(datasource)
     user = user or "unknown"
     return "/".join([project, user, model_name])
 
@@ -246,8 +249,7 @@ def get_project(datasource):
     Args:
         datasource: The odps url to extract project
     """
-    dsn = get_datasource_dsn(datasource)
-    _, _, _, project = db.parseMaxComputeDSN(dsn)
+    _, _, _, project = parse_maxcompute_dsn(datasource)
     return project
 
 
@@ -547,14 +549,14 @@ def create_predict_result_table(datasource, select, result_table, label_column,
         model_type: type of model defined in runtime.model.oss
     """
     conn = db.connect_with_data_source(datasource)
-    db.execute(conn, "DROP TABLE IF EXISTS %s" % result_table)
+    conn.execute("DROP TABLE IF EXISTS %s" % result_table)
     # PAI ml will create result table itself
     if model_type == EstimatorType.PAIML:
         return
 
     create_table_sql = "CREATE TABLE %s AS SELECT * FROM %s LIMIT 0" % (
         result_table, select)
-    db.execute(conn, create_table_sql)
+    conn.execute(create_table_sql)
 
     # if label is not in data table, add a int column for it
     schema = db.get_table_schema(conn, result_table)
@@ -565,11 +567,11 @@ def create_predict_result_table(datasource, select, result_table, label_column,
             break
     col_names = [col[0] for col in schema]
     if label_column not in col_names:
-        db.execute(
-            conn, "ALTER TABLE %s ADD %s %s" %
+        conn.execute(
+            "ALTER TABLE %s ADD %s %s" %
             (result_table, label_column, col_type))
     if train_label_column != label_column and train_label_column in col_names:
-        db.execute(
-            conn, "ALTER TABLE %s DROP COLUMN %s" %
+        conn.execute(
+            "ALTER TABLE %s DROP COLUMN %s" %
             (result_table, train_label_column))
 
@@ -668,7 +670,7 @@ def create_explain_result_table(datasource, data_table, result_table,
     """
     conn = db.connect_with_data_source(datasource)
     drop_stmt = "DROP TABLE IF EXISTS %s" % result_table
-    db.execute(conn, drop_stmt)
+    conn.execute(drop_stmt)
 
     create_stmt = ""
     if model_type == EstimatorType.PAIML:
@@ -703,7 +705,7 @@ def create_explain_result_table(datasource, data_table, result_table,
             "not supported modelType %d for creating Explain result table" %
             model_type)
 
-    if not db.execute(conn, create_stmt):
+    if not conn.execute(create_stmt):
         raise SQLFlowDiagnostic("Can't create explain result table")
 
 
@@ -731,7 +733,7 @@ def get_explain_random_forests_cmd(datasource, model_name, data_table,
 
     conn = db.connect_with_data_source(datasource)
     # drop result table if exists
-    db.execute(conn, "DROP TABLE IF EXISTS %s;" % result_table)
+    conn.execute("DROP TABLE IF EXISTS %s;" % result_table)
     schema = db.get_table_schema(conn, data_table)
     fields = [f[0] for f in schema if f[0] != label_column]
     return ('''pai -name feature_importance -project algo_public '''
@@ -846,7 +848,7 @@ def create_evaluate_result_table(datasource, result_table, metrics):
     sql = "CREATE TABLE IF NOT EXISTS %s (%s);" % (result_table,
                                                    ",".join(fields))
     conn = db.connect_with_data_source(datasource)
-    db.execute(conn, sql)
+    conn.execute(sql)
 
 
 def submit_pai_evaluate(datasource, model_name, select, result_table,
```
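The new `parse_maxcompute_dsn` helper simply forwards to the dbapi layer, replacing the old `db.parseMaxComputeDSN`. A sketch of the parsing call, assuming a MaxCompute URI of roughly the shape shown (scheme, endpoint, and credentials below are placeholders, and the 4-tuple order matches the call sites in this diff):

```python
# Hedged sketch; credentials, endpoint, and project are placeholders.
from runtime.dbapi.maxcompute import MaxComputeConnection

uri = ("maxcompute://ACCESS_ID:ACCESS_KEY@service.odps.aliyun.com/api"
       "?curr_project=my_project")
user, passwd, address, project = MaxComputeConnection.get_uri_parts(uri)
```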
