
Commit

Some initial attributes for dataframe
Ron Thompson committed Jun 13, 2018
1 parent 4b0ca4f commit ee485b0
Showing 4 changed files with 58 additions and 9 deletions.
11 changes: 10 additions & 1 deletion README.md
@@ -1,4 +1,13 @@
Welcome to koalas, the wrapper on the wrapper for Spark. The real iteration of
this should circumvent PySpark entirely, but for now we're going to use it as
our base. We've found that learning the syntax is a hurdle and that some of the
functionality from pandas doesn't necessarily exist, so koalas looks to tackle
that problem.

NOTE: This assumes you are using Databricks, which gives you the SparkSession
(spark) by default. We will endeavor to add that functionality later.

To do:
- Deal with caching results as execute rather than chaining (this is going to be
  tricky).
- Have show() execute upon return
- Deal with conversion of types
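
To make the README's intent concrete, here is a minimal usage sketch (not part
of this commit), assuming a Databricks notebook where spark is already a live
SparkSession; the table name events is hypothetical:

    from koalas import DataFrame

    # build a koalas DataFrame from a Spark SQL query
    df = DataFrame.read_sql_query("SELECT * FROM events")

    # pandas-style attributes backed by the underlying Spark DataFrame
    print(df.columns)   # list of column names
    print(df.dtypes)    # pd.Series of Spark dtypes, indexed by column name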
7 changes: 1 addition & 6 deletions koalas/__init__.py
@@ -1,6 +1 @@
-from .dataframe import DataFrame
-
-def from_query(query=None):
-    return DataFrame(query)
-
-def from_parquet(file=None):
+from .frame import DataFrame
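
The module-level from_query and from_parquet helpers removed here live on as
classmethods on DataFrame in frame.py below; a sketch of the equivalent calls,
again assuming spark is in scope, with a hypothetical parquet path:

    from koalas import DataFrame

    df_q = DataFrame.read_sql_query("SELECT 1 AS one")      # replaces from_query(...)
    df_p = DataFrame.read_parquet("/tmp/example.parquet")   # replaces from_parquet(...)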
45 changes: 43 additions & 2 deletions koalas/frame.py
@@ -1,19 +1,60 @@
import pandas as pd

class DataFrame(object):
    def __getattr__(self, name):
        """
        This needs work: it should return a wrapped column object with the
        available methods; for now it returns the raw pyspark Column.
        """
        if name not in self.__frame.columns:
            raise AttributeError(name)
        return self.__frame[name]

    def __getitem__(self, item):
        return getattr(self, item)

    def __init__(self, data=None, columns=None, query=None, file=None):
        """
        Ok so bad way of doing this right now, but it works, so stop
        complaining. Assumes spark is a SparkSession already in scope,
        as it is by default on Databricks (see the README note).
        """
        if data:
            self.__frame = spark.createDataFrame(data, columns)
        elif query:
            self.__frame = spark.sql(query)
        elif file:
            self.__frame = spark.read.parquet(file)

@classmethod
def read_sql_query(cls, query=None):
"""
        Read a SQL query into a DataFrame.
"""
return cls(query=query)

@classmethod
def read_parquet(cls, file):
"""
        Read a parquet file into a DataFrame.
"""
return cls(file=file)

@property
    def dtypes(self):
        """
        Return the datatypes as a pd.Series indexed by column name,
        similar to pandas.
        """
        # Spark's dtypes is a list of (name, type) pairs; zip(*...)
        # transposes it into a tuple of names and a tuple of types
        names, types = zip(*self.__frame.dtypes)
        return pd.Series(types, index=names)

@property
def columns(self):
return self.__frame.columns

    def filter(self, params):
        """
        Filter rows; a thin wrapper around pyspark's where().
        """
        return self.__frame.where(params)

def describe(self):
return self.__frame.describe()
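
For reference, the transpose trick inside the dtypes property can be tried
standalone, since Spark's dtypes attribute is just a list of (name, type)
string pairs; the pairs below are made up:

    import pandas as pd

    pairs = [("id", "bigint"), ("name", "string")]  # shape of spark_df.dtypes
    names, types = zip(*pairs)                      # transpose into two tuples
    print(pd.Series(types, index=names))
    # id      bigint
    # name    string
    # dtype: object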
4 changes: 4 additions & 0 deletions setup.py
@@ -0,0 +1,4 @@
from setuptools import setup

setup(name='koalas',
version='0.0.0',
description='Pandas Like Wrapper for Spark',
packages=['koalas'])
