
Commit

Some initial attributes for dataframe
Ron Thompson committed Jun 13, 2018
1 parent 4b0ca4f commit ee485b0
Showing 4 changed files with 58 additions and 9 deletions.
11 changes: 10 additions & 1 deletion README.md
@@ -1,4 +1,13 @@
Welcome to koalas, the wrapper on the wrapper for Spark. The real iteration of
this should circumvent PySpark entirely, but for now we're going to use it as
our base. We've found that learning the syntax is a hurdle and that some of the
functionality from pandas doesn't necessarily exist, so koalas looks to tackle
that problem.

NOTE: This assumes you are using Databricks, which gives you the SparkSession
(spark) by default. We will endeavor to add that functionality later.

To do:
- Deal with caching results as execute rather than chaining (this is going to be
  tricky).
- Have show() execute upon return
- Deal with conversion of types
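
To make the README's intent concrete, here is a minimal usage sketch (not part
of this commit), assuming a Databricks notebook where spark is already a live
SparkSession; the table name events is hypothetical:

    from koalas import DataFrame

    # build a koalas DataFrame from a Spark SQL query
    df = DataFrame.read_sql_query("SELECT * FROM events")

    # pandas-style attributes backed by the underlying Spark DataFrame
    print(df.columns)   # list of column names
    print(df.dtypes)    # pd.Series of Spark dtypes, indexed by column name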
7 changes: 1 addition & 6 deletions koalas/__init__.py
@@ -1,6 +1 @@
-from .dataframe import DataFrame
-
-def from_query(query=None):
-    return DataFrame(query)
-
-def from_parquet(file=None):
+from .frame import DataFrame
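
The module-level from_query and from_parquet helpers removed here live on as
classmethods on DataFrame in frame.py below; a sketch of the equivalent calls,
again assuming spark is in scope, with a hypothetical parquet path:

    from koalas import DataFrame

    df_q = DataFrame.read_sql_query("SELECT 1 AS one")      # replaces from_query(...)
    df_p = DataFrame.read_parquet("/tmp/example.parquet")   # replaces from_parquet(...)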
45 changes: 43 additions & 2 deletions koalas/frame.py
@@ -1,19 +1,60 @@
import pandas as pd

class DataFrame(object):
    def __getattr__(self, name):
        """
        This needs work: it should return a wrapped column object with the
        available methods; for now it returns the raw pyspark Column.
        """
        if name not in self.__frame.columns:
            raise AttributeError(name)
        return self.__frame[name]

    def __getitem__(self, item):
        return getattr(self, item)

    def __init__(self, data=None, columns=None, query=None, file=None):
        """
        Ok so bad way of doing this right now, but it works, so stop
        complaining. Assumes spark is a SparkSession already in scope,
        as it is by default on Databricks (see the README note).
        """
        if data:
            self.__frame = spark.createDataFrame(data, columns)
        elif query:
            self.__frame = spark.sql(query)
        elif file:
            self.__frame = spark.read.parquet(file)

@classmethod
def read_sql_query(cls, query=None):
"""
        Read a SQL query into a DataFrame.
"""
return cls(query=query)

@classmethod
def read_parquet(cls, file):
"""
        Read a parquet file into a DataFrame.
"""
return cls(file=file)

@property
    def dtypes(self):
        """
        Return the datatypes as a pd.Series indexed by column name,
        similar to pandas.
        """
        # Spark's dtypes is a list of (name, type) pairs; zip(*...)
        # transposes it into a tuple of names and a tuple of types
        names, types = zip(*self.__frame.dtypes)
        return pd.Series(types, index=names)

@property
def columns(self):
return self.__frame.columns

    def filter(self, params):
        """
        Filter rows; a thin wrapper around pyspark's where().
        """
        return self.__frame.where(params)

def describe(self):
return self.__frame.describe()
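
For reference, the transpose trick inside the dtypes property can be tried
standalone, since Spark's dtypes attribute is just a list of (name, type)
string pairs; the pairs below are made up:

    import pandas as pd

    pairs = [("id", "bigint"), ("name", "string")]  # shape of spark_df.dtypes
    names, types = zip(*pairs)                      # transpose into two tuples
    print(pd.Series(types, index=names))
    # id      bigint
    # name    string
    # dtype: object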
4 changes: 4 additions & 0 deletions setup.py
@@ -0,0 +1,4 @@
from setuptools import setup

setup(name='koalas',
version='0.0.0',
description='Pandas Like Wrapper for Spark',
packages=['koalas'])
