-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy path08-Advanced-Lua-Scripts.yaml
179 lines (163 loc) · 10 KB
/
08-Advanced-Lua-Scripts.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
name: Advanced Lua Scripts
description: This charter builds on the Lua customization allowed in the data-cleansing and matching rules.
version: 1637208553000
debug: true
# The global_lua section of the charter allows Lua functions to be defined in one place and re-used in any
# appropriate Lua section elsewhere in the charter.
#
# Note: For any multi-line Lua, use the | yaml symbol to put multi-line values on the following lines. Just
# make sure to indent it.
global_lua: |
-- Remove all digits from the string.
function no_number(s)
return s:gsub("%d", "")
end
-- Trim all surrounding whitespace from the string. ref: http://lua-users.org/wiki/StringTrim
function trim(s)
return s:gsub("^%s*(.-)%s*$", "%1")
end
-- These are filter functions used in custom constraints later. They return true if the record is of the
-- appropriate type. Don't worry about them for now, they should make more sense when you get to the custom
-- constraints.
invoices = function (record) return record["META.prefix"] == "INV" end
payments = function (record) return record["META.prefix"] == "PAY" end
jetwash:
source_files:
- pattern: ^08-invoices\.csv
column_mappings:
# This column mapping transforms the InvoiceRef column by removing any numeric digits and then trimming
# any surrounding whitespace from the remaining string. The functions trim and no_number are defined in
# the global_lua section above. The data is transformed and the files in the waiting folder will have
# the modified values in so the matching instructions below have access to them.
#
# Note: There is a jetwash helper called trim which doesn't require any Lua and is much neater to write.
# In this case though, because I want to perform Lua on the value, I have to use a Lua trim function in
# addition to my no_number function.
- map:
column: InvoiceRef
as_a: String
# In the Lua below, the variable 'value' is set to the current record's value for the column being
# mapped. It is always a String data-type in jetwash regardless of the actual column's data-type.
from: trim(no_number(value))
- pattern: ^08-payments\.csv
new_columns:
# Okay this next bit is pretty neat but there's quite a bit of new stuff going on so lets unpack it. Firstly
# we are creating a new column in the jetwash process which will categorize the payments by their amount.
# In the mapping above the Lua script had access to the 'value' variable. For new_columns, the Lua script
# has access to a variable 'record' and each column's current value. So below, for example we are reading
# the current record's 'PaymentAmount' column, but we could read any of them.
#
# As with column_mappings, the values from the record will always be a String within the Lua script -
# although we are not constrained to returning strings from our mappings - for example if we wanted to
# parse a non-standard date format and return a Datetime then, although the source values would be a string
# We are allowed to return numerics and dates.
#
# Because record["PaymentAmount"] is a string, we use the internal helper function 'decimal' to parse the
# value into a decimal and then compare that to a literal number. Note: we also wrap 500.0 as a decimal
# because Lua only understands floating point numbers, so the 500.0 below is a float and we want a precise
# decimal data-type to avoid precision loss.
#
# The end result of this operation is to append a new column to each payment called 'Size' (or 'PAY.Size'
# if you're in the matching section below) which is either 'BIG' or 'SMALL'.
- column: Size
as_a: String
from: |
-- If the payment is over 500 it's a BIG payment.
if decimal(record["PaymentAmount"]) > decimal(500.0) then
return "BIG"
else
return "SMALL"
end
matching:
source_files:
- pattern: .*08-invoices.*\.csv
field_prefix: INV
- pattern: .*08-payments.*\.csv
field_prefix: PAY
instructions:
- merge:
columns: ['INV.InvoiceAmount', 'PAY.PaymentAmount']
into: AMOUNT
- merge:
columns: ['INV.InvoiceDate', 'PAY.PaymentDate']
into: INVOICE_DATE
- merge:
columns: ['INV.InvoiceRef', 'PAY.PaymentRef']
into: REF
- group:
by: ['REF']
match_when:
# Up until now, we've only used the built-in constraints nets_to_zero and nets_with_tolerance. Here we'll start to
# explore the custom constraint type which uses a Lua script to evaluate a group's eligibility for a match.
- custom:
# The Lua script will have access to all the record data in a potential grouping and can call certain pre-baked
# Lua aggregate functions (see further examples below) on the group.
script: |
-- Amazingly you can debug print from Lua into the application's main log - obviously these need to be
-- removed before running a production system - but they can be a great way to debug issues in your scripts.
-- commented out to avoid spam in the integration tests.
--print("Lua debugging works in the console output too!")
-- The 'records' Lua table contains all the records in the potential group. You can iterate them with the
-- standard Lua pairs and ipairs functions. Also note: Lua array indexes start at 1 not 0.
for idx, record in ipairs(records) do
-- Each record is a table of field values which you can also reference and/or iterate.
for field, value in pairs(record) do
-- commented out to avoid spam in the integration tests.
-- print("Record [" ..idx.. "] has a value of [" .. value .. "] in field [" .. field .. "]")
end
end
-- There are also some additional metadata fields on the record which contains record context
-- details, such as the 'prefix' - the applied field_prefix from the source_files section of the charter.
-- The meta table also contains 'filename' and the 'timestamp' extracted from the filename. i.e.
-- record["META.prefix"]
-- record["META.timestamp"] << Unix Epoch millisecond timestamp.
-- record["META.filename"]
-- Ultimately a constraint must return true or false - because this one is only demonstrating the Lua
-- integration and not actually evaluating anything, we'll return true so other constraints can be
-- evaluated.
return true
- custom:
# By default a custom constraint will have access to all fields on a record. This can result in a performance
# hit on large numbers of columns and records. So to alleviate this, when the script is complete and ready
# for production, you can use the 'available_fields' array to restrict which columns on a record are available
# in the Lua script below. You should strive to keep this fields array as small as possible
available_fields: ['INVOICE_DATE', 'INV.InvoiceRef', 'PAY.PaymentRef']
script: |
-- commented out to avoid spam in the integration tests.
-- print("Hello from Lua - in custom constraint 2 - there will be less fields output than the previous constraint")
for idx, record in ipairs(records) do
-- Each record is a table of field values which you can also reference or iterate.
for field, value in pairs(record) do
-- commented out to avoid spam in the integration tests.
-- print("Record [" ..idx.. "] F [" ..field.. "] V [" ..value.. "]")
end
end
return true
# Custom constraint scripts have access to these aggregate functions which operate on a group of
# records being evaluated for a match: -
# count
# sum (sum_int for integers)
# min (min_int for integers)
# max (max_int for integers)
- custom:
# Here we will allow one (and only one) invoice to match against one or more payments.
script: |
-- count is a function that takes a filter as an argument (defined in the global Lua section) and
-- counts the number of records in the group matching the filter.
return count(invoices) == 1 and count(payments) > 0
# This is an example of writing a nets-to-zero constraint manually in Lua script. You could adapt this for example,
# if your data needed to be sensitive to numerical signs or the transactions had an ours/theirs indicator field.
- custom:
# This is an example of writing a nets-to-zero constraint manually in Lua script.
script: |
-- sum is a function that takes 2 arguments. The first is the name of a column on the records.
-- the second is a filter to apply to the records in the group. If you wish to sum an integer column then
-- use the sum_int function instead.
local inv_sum = sum("AMOUNT", invoices)
local pay_sum = sum("AMOUNT", payments)
-- Finally, our group matches if it has all types of records and invoices net to zero with
-- the payments. Whilst Lua has it's own Math.abs() function, the version used below is a specialized
-- version designed to work with the Decimal data-type returned by the sum aggregate functions above.
return count(invoices) > 0
and count(payments) > 0
and abs((abs(inv_sum) - abs(pay_sum))) == decimal(0)