Skip to content

Commit

Permalink
Merge pull request #157 from rokostik/spreadsheet-joins
Browse files Browse the repository at this point in the history
Implement spreadsheet joins
  • Loading branch information
refaktor committed Mar 16, 2024
2 parents acf03fb + d714f46 commit ada2ef3
Show file tree
Hide file tree
Showing 3 changed files with 202 additions and 10 deletions.
2 changes: 1 addition & 1 deletion env/spreadsheet.go
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ func (s Spreadsheet) Columns(ps *ProgramState, names []string) Object {
return *nspr
}

func (s Spreadsheet) GetRow(ps *ProgramState, index int) Object {
func (s Spreadsheet) GetRow(ps *ProgramState, index int) SpreadsheetRow {
row := s.Rows[index]
row.Uplink = &s
return row
Expand Down
163 changes: 155 additions & 8 deletions evaldo/builtins_spreadsheet.go
Original file line number Diff line number Diff line change
Expand Up @@ -567,7 +567,11 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
if !ok {
return MakeBuiltinError(ps, "Second element of replacement block must be a string.", "add-col!")
}
return GenerateColumnRegexReplace(ps, spr, newCol, fromCols, regex, replaceStr.Value)
err := GenerateColumnRegexReplace(ps, &spr, newCol, fromCols, regex, replaceStr.Value)
if err != nil {
return err
}
return spr
default:
return MakeArgError(ps, 3, []env.Type{env.BlockType}, "add-col!")
}
Expand All @@ -582,9 +586,9 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
}
},
},
"add-index!": {
"add-indexes!": {
Argsn: 2,
Doc: "Indexes all values in a colun and istre it,",
Doc: "Creates an index for all values in the provided columns. Changes in-place and returns the new spreadsheet.",
Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) env.Object {
switch spr := arg0.(type) {
case env.Spreadsheet:
Expand All @@ -599,9 +603,9 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
return MakeError(ps, "Block of tagwords needed")
}
}
res := AddIndexes(ps, &spr, colWords)
if res != nil {
return res
err := AddIndexes(ps, &spr, colWords)
if err != nil {
return err
}
return spr
default:
Expand All @@ -612,6 +616,22 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
}
},
},
"indexes?": {
Argsn: 1,
Doc: "Returns the columns that are indexed in a spreadsheet.",
Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) (res env.Object) {
switch spr := arg0.(type) {
case env.Spreadsheet:
res := make([]env.Object, 0)
for col := range spr.Indexes {
res = append(res, *env.NewString(col))
}
return *env.NewBlock(*env.NewTSeries(res))
default:
return MakeArgError(ps, 1, []env.Type{env.SpreadsheetType}, "indexes?")
}
},
},
"autotype": {
Argsn: 2,
Doc: "Takes a spreadsheet and tries to determine and change the types of columns.",
Expand All @@ -629,6 +649,70 @@ var Builtins_spreadsheet = map[string]*env.Builtin{
}
},
},
"left-join": {
Argsn: 4,
Doc: "Left joins two spreadsheets on the given columns.",
Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) (res env.Object) {
switch spr1 := arg0.(type) {
case env.Spreadsheet:
switch spr2 := arg1.(type) {
case env.Spreadsheet:
switch col1 := arg2.(type) {
case env.Word:
col2, ok := arg3.(env.Word)
if !ok {
return MakeArgError(ps, 4, []env.Type{env.WordType}, "left-join")
}
return LeftJoin(ps, spr1, spr2, ps.Idx.GetWord(col1.Index), ps.Idx.GetWord(col2.Index), false)
case env.String:
col2, ok := arg3.(env.String)
if !ok {
MakeArgError(ps, 4, []env.Type{env.StringType}, "left-join")
}
return LeftJoin(ps, spr1, spr2, col1.Value, col2.Value, false)
default:
return MakeArgError(ps, 3, []env.Type{env.WordType, env.StringType}, "left-join")
}
default:
return MakeArgError(ps, 2, []env.Type{env.SpreadsheetType}, "left-join")
}
default:
return MakeArgError(ps, 1, []env.Type{env.SpreadsheetType}, "left-join")
}
},
},
"inner-join": {
Argsn: 4,
Doc: "Inner joins two spreadsheets on the given columns.",
Fn: func(ps *env.ProgramState, arg0 env.Object, arg1 env.Object, arg2 env.Object, arg3 env.Object, arg4 env.Object) (res env.Object) {
switch spr1 := arg0.(type) {
case env.Spreadsheet:
switch spr2 := arg1.(type) {
case env.Spreadsheet:
switch col1 := arg2.(type) {
case env.Word:
col2, ok := arg3.(env.Word)
if !ok {
return MakeArgError(ps, 4, []env.Type{env.WordType}, "inner-join")
}
return LeftJoin(ps, spr1, spr2, ps.Idx.GetWord(col1.Index), ps.Idx.GetWord(col2.Index), true)
case env.String:
col2, ok := arg3.(env.String)
if !ok {
MakeArgError(ps, 4, []env.Type{env.StringType}, "inner-join")
}
return LeftJoin(ps, spr1, spr2, col1.Value, col2.Value, true)
default:
return MakeArgError(ps, 3, []env.Type{env.WordType, env.StringType}, "inner-join")
}
default:
return MakeArgError(ps, 2, []env.Type{env.SpreadsheetType}, "inner-join")
}
default:
return MakeArgError(ps, 1, []env.Type{env.SpreadsheetType}, "inner-join")
}
},
},
}

func GenerateColumn(ps *env.ProgramState, s env.Spreadsheet, name env.Word, extractCols env.Block, code env.Block) env.Object {
Expand Down Expand Up @@ -668,7 +752,7 @@ func GenerateColumn(ps *env.ProgramState, s env.Spreadsheet, name env.Word, extr
return s
}

func GenerateColumnRegexReplace(ps *env.ProgramState, s env.Spreadsheet, name env.Word, fromColName env.Word, re *regexp.Regexp, pattern string) env.Object {
func GenerateColumnRegexReplace(ps *env.ProgramState, s *env.Spreadsheet, name env.Word, fromColName env.Word, re *regexp.Regexp, pattern string) env.Object {
// add name to columns
s.Cols = append(s.Cols, ps.Idx.GetWord(name.Index))
for ix, row := range s.Rows {
Expand All @@ -690,7 +774,7 @@ func GenerateColumnRegexReplace(ps *env.ProgramState, s env.Spreadsheet, name en
row.Values = append(row.Values, newVal)
s.Rows[ix] = row
}
return s
return nil
}

func AddIndexes(ps *env.ProgramState, s *env.Spreadsheet, columns []env.Word) env.Object {
Expand Down Expand Up @@ -938,3 +1022,66 @@ func AutoType(ps *env.ProgramState, s *env.Spreadsheet, percent float64) env.Obj

return *newS
}

func LeftJoin(ps *env.ProgramState, s1 env.Spreadsheet, s2 env.Spreadsheet, col1 string, col2 string, innerJoin bool) env.Object {
if !slices.Contains(s1.Cols, col1) {
return MakeBuiltinError(ps, "Column not found in first spreadsheet.", "left-join")
}
if !slices.Contains(s2.Cols, col2) {
return MakeBuiltinError(ps, "Column not found in second spreadsheet.", "left-join")
}

combinedCols := make([]string, len(s1.Cols)+len(s2.Cols))
copy(combinedCols, s1.Cols)
for i, v := range s2.Cols {
if slices.Contains(combinedCols, v) {
combinedCols[i+len(s1.Cols)] = v + "_2"
} else {
combinedCols[i+len(s1.Cols)] = v
}
}
nspr := env.NewSpreadsheet(combinedCols)
for _, row1 := range s1.GetRows() {
val1, err := s1.GetRowValue(col1, row1)
if err != nil {
return MakeError(ps, "Couldn't retrieve value at row")
}
newRow := make([]any, len(combinedCols))

// the row id of the second spreadsheet that matches the current row
s2RowId := -1
// use index if available
if ix, ok := s2.Indexes[col2]; ok {
if rowIds, ok := ix[val1]; ok {
// if there are multiple rows with the same value (ie. joining on non-unique column), just use the first one
s2RowId = rowIds[0]
}
} else {
for i, row2 := range s2.GetRows() {
val2, err := s2.GetRowValue(col2, row2)
if err != nil {
return MakeError(ps, "Couldn't retrieve value at row")
}
if val1.(env.Object).Equal(val2.(env.Object)) {
s2RowId = i
break
}
}
}
if innerJoin && s2RowId == -1 {
continue
}
copy(newRow, row1.Values)
if s2RowId > -1 {
for i, v := range s2.GetRow(ps, s2RowId).Values {
newRow[i+len(s1.Cols)] = v
}
} else {
for i := range s2.Cols {
newRow[i+len(s1.Cols)] = env.Void{}
}
}
nspr.AddRow(*env.NewSpreadsheetRow(newRow, nspr))
}
return *nspr
}
47 changes: 46 additions & 1 deletion tests/structures.rye
Original file line number Diff line number Diff line change
Expand Up @@ -572,7 +572,7 @@ section "Serializers and loaders"


section "Spreadsheet related functions"
"Functions for handling and working with Context."
"Functions for creating and working with spreadsheets."
{

group "spreadsheet & related"
Expand Down Expand Up @@ -608,6 +608,51 @@ section "Spreadsheet related functions"
equal { to-spreadsheet vals { dict { "a" 1 b 2 } dict { "a" 3 "b" 4 } } } spreadsheet { "a" "b" } { 1 2 3 4 }
equal { to-spreadsheet vals { dict { "a" 1 b 2 "c" 3 } dict { "a" 4 "b" 5 } } } spreadsheet { "a" "b" "c" } { 1 2 3 4 5 _ }
}

group "index"
mold\nowrap ?add-indexes!
{ { block } }
{
; returned value
equal { spr: spreadsheet { "a" "b" } { 1 2 3 4 } |add-indexes! [ 'a ] |indexes? } { "a" }
; in-place
; TODO this should work but doesn't, index should be added in place but for some reason it isn't
; equal { spr: spreadsheet { "a" "b" } { 1 2 3 4 } , spr .add-indexes! [ 'a ] , spr .indexes? } { "a" }
}

group "left join"
mold\nowrap ?left-join
{ { block } }
{
equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" 4 "Corrino" } ,
names .left-join houses 'id 'id
} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 2 "Chani" _ _ 3 "Vladimir" 3 "Harkonnen" }

; joining with an index on the second spreadsheet
equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" } ,
houses .add-indexes! [ 'id ] :houses ,
names .left-join houses 'id 'id
} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 2 "Chani" _ _ 3 "Vladimir" 3 "Harkonnen" }
}

group "inner join"
mold\nowrap ?inner-join
{ { block } }
{
equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" 4 "Corrino" } ,
names .inner-join houses 'id 'id
} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 3 "Vladimir" 3 "Harkonnen" }

; joining with an index on the second spreadsheet
equal { names: spreadsheet { "id" "name" } { 1 "Paul" 2 "Chani" 3 "Vladimir" } ,
houses: spreadsheet { "id" "house" } { 1 "Atreides" 3 "Harkonnen" } ,
houses .add-indexes! [ 'id ] :houses ,
names .inner-join houses 'id 'id
} spreadsheet { "id" "name" "id_2" "house" } { 1 "Paul" 1 "Atreides" 3 "Vladimir" 3 "Harkonnen" }
}
}


Expand Down

0 comments on commit ada2ef3

Please sign in to comment.