diff --git a/.version b/.version index 47b5c29edf..9966253da3 100644 --- a/.version +++ b/.version @@ -1 +1 @@ -2022.01 +2022.07 \ No newline at end of file diff --git a/default.nix b/default.nix index f2536f0e2a..5e0b4252cb 100644 --- a/default.nix +++ b/default.nix @@ -19,11 +19,7 @@ stdenv.mkDerivation rec { patchPhase = '' patchShebangs . - - # some hardcodeism - for f in $(find src/program/snabbnfv/ -type f); do - substituteInPlace $f --replace "/bin/bash" "${bash}/bin/bash" - done + '' + lib.optionalString supportOpenstack '' # We need a way to pass $PATH to the scripts sed -i '2iexport PATH=${git}/bin:${mariadb}/bin:${which}/bin:${procps}/bin:${coreutils}/bin' src/program/snabbnfv/neutron_sync_master/neutron_sync_master.sh.inc diff --git a/lib/ljsyscall/syscall/linux/syscalls.lua b/lib/ljsyscall/syscall/linux/syscalls.lua index 8766481376..0282a904fd 100644 --- a/lib/ljsyscall/syscall/linux/syscalls.lua +++ b/lib/ljsyscall/syscall/linux/syscalls.lua @@ -814,17 +814,22 @@ function S.setegid(egid) return S.setresgid(-1, egid, -1) end -- note currently all returned as strings, may want to list which should be numbers function S.sysctl(name, new) name = "/proc/sys/" .. name:gsub("%.", "/") - local flag = c.O.RDONLY - if new then flag = c.O.RDWR end - local fd, err = S.open(name, flag) + local fd, err = S.open(name, c.O.RDONLY) if not fd then return nil, err end local len = 1024 local old, err = S.read(fd, nil, len) if not old then return nil, err end old = old:sub(1, #old - 1) -- remove trailing newline + local ok, err = S.close(fd) + if not ok then return nil, err end if not new then return old end + -- Reopen fd because we want to write at pos 0 + local fd, err = S.open(name, c.O.WRONLY) + if not fd then return nil, err end local ok, err = S.write(fd, new) if not ok then return nil, err end + local ok, err = S.close(fd) + if not ok then return nil, err end return old end diff --git a/lib/luajit/.gitignore b/lib/luajit/.gitignore index 1a07bf75bf..a0e6e84e23 100644 --- a/lib/luajit/.gitignore +++ b/lib/luajit/.gitignore @@ -9,3 +9,12 @@ *.dmp *.swp .tags +*.dwo +/src/lj_bcdef.h +/src/lj_ffdef.h +/src/lj_folddef.h +/src/lj_libdef.h +/src/lj_recdef.h +/src/lj_vm.S +/src/raptorjit +/src/host/buildvm_arch.h diff --git a/src/README.md b/src/README.md index e02bddb988..62ac156f16 100644 --- a/src/README.md +++ b/src/README.md @@ -55,7 +55,7 @@ these in a desired way using *links* and finally pass the resulting app network on to the Snabb engine. The engine's job is to: * Pump traffic through the app network - * Keep the app network running (e.g. restart failed apps) + * Apply, and inform apps of configuration and link changes * Report on the network status @@ -117,11 +117,24 @@ will be used to validate the app’s arg when it is configured using `config.app`. -— Method **myapp:link** +— Method **myapp:link** *dir* *name* -*Optional*. Called any time the app’s links may have been changed (including on -start-up). Guaranteed to be called before `pull` and `push` are called with new -links. +*Optional*. Called during `engine.configure()` when a link of the app is +added. Unless `unlink` is specified this method is also called when a link +is removed. +Guaranteed to be called before `pull` and `push` are called with new links. + +*Dir* is either `'input'` or `'output'`, and *name* is the string name +of the link. I.e., the added link can be accessed at `self[dir][name]`. + + +— Method **myapp:unlink** *dir* *name* + +*Optional*. 
Called during `engine.configure()` when a link of the app is
+removed.
+
+*Dir* is either `'input'` or `'output'`, and *name* is the string name
+of the link.

— Method **myapp:pull**

@@ -139,6 +152,53 @@ transmitting them to output ports. For example:

Move packets from input ports to output ports or to a network adapter.

+— Field **myapp.push_link**
+
+*Optional*. When specified, must be a table of per-link `push()` methods
+that take an input link as an argument. For example, an app could specify
+a **push_link** method for its input link *foo*:
+
+```
+Myapp = { push_link={} }
+function Myapp.push_link:foo (input)
+   while not link.empty(input) do something() end
+end
+```
+
+**Push_link** methods are copied to a fresh table when the app is started,
+and it is valid to create **push_link** methods dynamically during `link()`,
+for example like so:
+
+```
+Myapp = { push_link={} }
+function Myapp:link (dir, name)
+   -- NB: Myapp.push_link ~= self.push_link
+   if dir == 'input' then
+      self.push_link[name] = function (self, input)
+         while not link.empty(input) do something() end
+      end
+   end
+end
+function Myapp:unlink (dir, name)
+   if dir == 'input' then
+      self.push_link[name] = nil
+   end
+end
+```
+
+**Push** is not called when an app has **push_link** methods
+for *all* of its input links. If, however, an app has at least one input
+link without an associated **push_link** method, then **push** is called
+in addition to the **push_link** methods.
+
+
+— Method **myapp:tick**
+
+*Optional*. Called periodically at **engine.tick_Hz** frequency.
+
+For example: perform periodic housekeeping such as synchronizing
+counters.
+
— Method **myapp:reconfig** *arg*

@@ -285,6 +345,13 @@ how many times per second to poll. This setting is not used when
engine.busywait is true.

+— Variable **engine.tick_Hz**
+
+Frequency at which to call **app:tick** methods. The default value is
+1000 (`tick()` is called every millisecond).
+
+A value of 0 effectively disables `tick()` methods.
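+
+For instance, an app can use `tick` for low-frequency housekeeping
+that should not run on every breath. A minimal sketch (`Heartbeat` is
+a hypothetical app):
+
+```
+Heartbeat = {}
+
+function Heartbeat:new ()
+   return setmetatable({ticks = 0}, {__index = Heartbeat})
+end
+
+function Heartbeat:tick ()
+   -- With the default engine.tick_Hz = 1000 this runs every millisecond.
+   self.ticks = self.ticks + 1
+end
+```

## Link (core.link)

A *link* is a [ring buffer](http://en.wikipedia.org/wiki/Circular_buffer)

diff --git a/src/apps/interlink/freelist_instrument.lua b/src/apps/interlink/freelist_instrument.lua
new file mode 100644
index 0000000000..4070a1063d
--- /dev/null
+++ b/src/apps/interlink/freelist_instrument.lua
@@ -0,0 +1,40 @@
+-- Use of this source code is governed by the Apache 2.0 license; see COPYING.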
+ +module(...,package.seeall) + +local histogram = require("core.histogram") +local tsc = require("lib.tsc") + +function instrument_freelist () + local ts = tsc.new() + local rebalance_latency = histogram.create('engine/rebalance_latency.histogram', 1, 100e6) + local reclaim_latency = histogram.create('engine/reclaim_latency.histogram', 1, 100e6) + + local rebalance_step, reclaim_step = packet.rebalance_step, packet.reclaim_step + packet.rebalance_step = function () + local start = ts:stamp() + rebalance_step() + rebalance_latency:add(tonumber(ts:to_ns(ts:stamp()-start))) + end + packet.reclaim_step = function () + local start = ts:stamp() + reclaim_step() + reclaim_latency:add(tonumber(ts:to_ns(ts:stamp()-start))) + end + + return rebalance_latency, reclaim_latency +end + +function histogram_csv_header (out) + out = out or io.stdout + out:write("histogram,lo,hi,count\n") +end + +function histogram_csv (histogram, name, out) + out = out or io.stdout + name = name or 'untitled' + for count, lo, hi in histogram:iterate() do + out:write(("%s,%f,%f,%d\n"):format(name, lo, hi, tonumber(count))) + out:flush() + end +end \ No newline at end of file diff --git a/src/apps/interlink/test_sink.lua b/src/apps/interlink/test_sink.lua new file mode 100644 index 0000000000..d72f072f5a --- /dev/null +++ b/src/apps/interlink/test_sink.lua @@ -0,0 +1,38 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +module(...,package.seeall) + +local Receiver = require("apps.interlink.receiver") +local Sink = require("apps.basic.basic_apps").Sink +local lib = require("core.lib") +local numa = require("lib.numa") + +function configure (c, name) + config.app(c, name, Receiver) + config.app(c, "sink", Sink) + config.link(c, name..".output -> sink.input") +end + +function start (name, duration) + local c = config.new() + configure(c, name) + engine.configure(c) + engine.main{duration=duration} +end + +local instr = require("apps.interlink.freelist_instrument") + +function start_instrument (name, duration, core) + numa.bind_to_cpu(core, 'skip') + local rebalance_latency = instr.instrument_freelist() + start(name, duration) + instr.histogram_csv(rebalance_latency, "rebalance") + local min, avg, max = rebalance_latency:summarize() + io.stderr:write(("(%d) rebalance latency (ns) min:%16s avg:%16s max:%16s\n") + :format(core, + lib.comma_value(math.floor(min)), + lib.comma_value(math.floor(avg)), + lib.comma_value(math.floor(max)))) + io.stderr:flush() +end + diff --git a/src/apps/interlink/test_source.lua b/src/apps/interlink/test_source.lua index cfa71b7417..ecfdd664fc 100644 --- a/src/apps/interlink/test_source.lua +++ b/src/apps/interlink/test_source.lua @@ -4,12 +4,56 @@ module(...,package.seeall) local Transmitter = require("apps.interlink.transmitter") local Source = require("apps.basic.basic_apps").Source +local lib = require("core.lib") +local numa = require("lib.numa") -function start (name) - local c = config.new() +function configure (c, name) config.app(c, name, Transmitter) config.app(c, "source", Source) - config.link(c, "source.output -> "..name..".input") + config.link(c, "source."..name.." 
-> "..name..".input") +end + +function start (name, duration) + local c = config.new() + configure(c, name) engine.configure(c) - engine.main() + engine.main{duration=duration} end + +function startn (name, duration, n) + local c = config.new() + for i=1,n do + configure(c, name..i) + end + engine.configure(c) + engine.main{duration=duration} +end + +function txpackets () + local txpackets = 0 + for _, output in ipairs(engine.app_table["source"].output) do + txpackets = txpackets + link.stats(output).rxpackets + end + return txpackets +end + +local instr = require("apps.interlink.freelist_instrument") + +function startn_instrument (name, duration, n, core) + numa.bind_to_cpu(core, 'skip') + local _, reclaim_latency = instr.instrument_freelist() + startn(name, duration, n) + local txpackets = txpackets() + instr.histogram_csv(reclaim_latency, "reclaim") + local min, avg, max = reclaim_latency:summarize() + engine.main{duration=1, no_report=true} + io.stderr:write(("(%d) reclaim latency (ns) min:%16s avg:%16s max:%16s\n") + :format(core, + lib.comma_value(math.floor(min)), + lib.comma_value(math.floor(avg)), + lib.comma_value(math.floor(max)))) + io.stderr:write(("%.3f Mpps\n"):format(txpackets / 1e6 / duration)) + io.stderr:flush() + + --engine.report_links() +end \ No newline at end of file diff --git a/src/apps/interlink/wait_test.snabb b/src/apps/interlink/wait_test.snabb new file mode 100755 index 0000000000..ebe0cc0d94 --- /dev/null +++ b/src/apps/interlink/wait_test.snabb @@ -0,0 +1,36 @@ +#!snabb snsh + +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. + +local worker = require("core.worker") +local numa = require("lib.numa") + +-- Test wait times caused by group freelist rebalancing +-- Synopsis: wait_test.snabb [duration] [nconsumers] +local DURATION = tonumber(main.parameters[1]) or 10 +local NCONSUMERS = tonumber(main.parameters[2]) or 10 +local CPUS = numa.parse_cpuset(main.parameters[3] or "") + +local cores = {} +for core in pairs(CPUS) do + table.insert(cores, core) + table.sort(cores) +end + +require("apps.interlink.freelist_instrument").histogram_csv_header() +io.stdout:flush() + +for i=1,NCONSUMERS do + worker.start("sink"..i, ([[require("apps.interlink.test_sink").start_instrument(%q, %d, %s)]]) + :format("test"..i, DURATION, cores[1+i])) +end + +worker.start("source", ([[require("apps.interlink.test_source").startn_instrument(%q, %d, %d, %s)]]) + :format("test", DURATION, NCONSUMERS, assert(cores[1]))) + +engine.main{done = function () + for w, s in pairs(worker.status()) do + if s.alive then return false end + end + return true +end} \ No newline at end of file diff --git a/src/apps/ipfix/README.md b/src/apps/ipfix/README.md index be07a179c4..da8bba5c65 100644 --- a/src/apps/ipfix/README.md +++ b/src/apps/ipfix/README.md @@ -33,11 +33,31 @@ idle and available for expiry. The default is 300 seconds. *Optional*. Period at which an active, non-idle flow should produce export records. The default is 120 seconds. +— Key **flush_timeout** + +*Optional*. Maximum number of seconds after which queued data records +are exported. If set to a positive value, data records are queued +until a flow export packet of maximum size according to the configured +**mtu** can be generated or **flush_timeout** seconds have passed +since the last export packet was generated, whichever occurs first. +If set to zero, data records are exported immediately after each scan +of the flow cache. The default is 10 seconds. + — Key **cache_size** *Optional*. 
+— Key **scan_time**
+
+*Optional*. The flow cache for every configured template is scanned
+continuously to check for entries eligible for export based on the
+**idle_timeout** and **active_timeout** parameters. The **scan_time**
+determines the interval in seconds that a scan of the entire flow
+cache will take. The implementation uses a token bucket mechanism by
+which access to the tables is distributed evenly over the time
+interval. The default is 10 seconds.
+
— Key **template_refresh_interval**

*Optional*. Period at which to send template records over UDP. The

diff --git a/src/apps/ipfix/dns.lua b/src/apps/ipfix/dns.lua
new file mode 100644
index 0000000000..64defa1199
--- /dev/null
+++ b/src/apps/ipfix/dns.lua
@@ -0,0 +1,307 @@
+module(..., package.seeall)
+
+local ffi = require("ffi")
+local lib = require("core.lib")
+
+-- By default, CNAME and RRSIG records in the answer section are
+-- skipped.
+skip_CNAMEs_RRSIGs = true
+
+local uint16_ptr_t = ffi.typeof("uint16_t *")
+local uint8_ptr_t = ffi.typeof("uint8_t *")
+
+local dns_hdr_t = ffi.typeof([[
+   struct {
+      uint16_t id;
+      uint16_t flags;
+      uint16_t qcount;
+      uint16_t anscount;
+      uint16_t authcount;
+      uint16_t addcount;
+   } __attribute__((packed))
+]])
+local dns_hdr_ptr_t = ffi.typeof("$*", dns_hdr_t)
+
+-- The part of a RR following the encoded name
+local rr_t = ffi.typeof([[
+   struct {
+      uint16_t type;
+      uint16_t class;
+      uint32_t ttl;
+      uint16_t rdlength;
+      uint8_t  rdata[0];
+   } __attribute__((packed))
+]])
+local rr_ptr_t = ffi.typeof("$*", rr_t)
+
+-- Given a region of memory of size size starting at start_ptr, return
+-- the number of bytes in the sub-region starting at data_ptr. A
+-- result <= 0 indicates that data_ptr is not within the region.
+local function available_bytes(start_ptr, size, data_ptr)
+   assert(data_ptr >= start_ptr)
+   return size - (data_ptr - start_ptr)
+end
+
+-- Incorrectly compressed domain names can form loops. We use a table
+-- that can hold all possible offsets (14 bits) encoded in compressed
+-- names to detect such loops in a branch-free fashion by marking
+-- which offsets have been encountered while de-compressing a
+-- name. The marker consists of a 16-bit number which is increased for
+-- each invocation of decompress_name(). The offset table must be
+-- reset each time the marker wraps around to zero. Because a valid
+-- offset must be at least 12 (due to it being relative to the start of
+-- the DNS header), the current marker is stored in offset_table[0]
+-- without causing a conflict.
+local offset_table = ffi.new("uint16_t [16384]")
+
+-- Decompress the on-the-wire representation of a domain name starting
+-- at ptr and write up to size bytes of the decompressed name to the
+-- location pointed to by buffer. hdr_ptr is a pointer to the
+-- beginning of the DNS header to resolve compressed names.
+--
+-- Note that DNS extraction is only initiated if the packet is not
+-- truncated. Even then, decompression can lead us out of the message
+-- if
+--
+--  * the message is corrupt
+--  * the message is fragmented and decompression points into
+--    a non-initial fragment
+--
+-- msg_size is the number of bytes in the message, including the
+-- header, used to check whether decompression stays within the
+-- message.
+--
+-- Returns a pointer to the first byte after the name or nil if the
+-- name could not be decompressed and the number of bytes that have
+-- been copied to the buffer.
If the pointer is nil, the buffer +-- contains what has been decompressed so far. +local function decompress_name(hdr_ptr, msg_size, ptr, buffer, size) + offset_table[0] = offset_table[0] + 1 + if offset_table[0] == 0 then + ffi.fill(offset_table, ffi.sizeof(offset_table)) + offset_table[0] = 1 + end + + local offset = 0 + if available_bytes(hdr_ptr, msg_size, ptr) < 1 then + return nil, offset + end + local result_ptr = nil + local length = ptr[0] + while length ~= 0 do + local label_type = bit.band(0xc0, length) + if label_type == 0xc0 then + if available_bytes(hdr_ptr, msg_size, ptr) < 2 then + return nil, offset + end + -- Compressed name, length is the offset relative to the start + -- of the DNS message where the remainder of the name is stored + local name_offset = + bit.band(0x3fff, lib.ntohs(ffi.cast(uint16_ptr_t, ptr)[0])) + -- Sanity check and Loop detection + if (name_offset < ffi.sizeof(dns_hdr_t) or name_offset >= msg_size or + offset_table[name_offset] == offset_table[0]) then + return nil, offset + end + offset_table[name_offset] = offset_table[0] + if result_ptr == nil then + -- This is the first redirection encountered in the name, + -- the final result is the location just behind that + -- pointer + result_ptr = ptr + 2 + end + ptr = hdr_ptr + name_offset + elseif label_type ~= 0 then + -- Unsupported/undefined label type + return nil, offset + else + if available_bytes(hdr_ptr, msg_size, ptr) < length + 1 then + -- Truncated label + return nil, offset + end + -- Remaining space in the buffer for the name + local avail = size - offset + if avail > 0 then + -- Copy as much of the label as possible + local eff_length = math.min(length+1, avail) + ffi.copy(buffer + offset, ptr, eff_length) + offset = offset + eff_length + end + ptr = ptr + length + 1 + end + length = ptr[0] + end + -- We've reached the root label + if offset < size then + buffer[offset] = 0 + offset = offset + 1 + end + if result_ptr == nil then + result_ptr = ptr + 1 + end + return result_ptr, offset +end + +-- RDATA with a single domain name +local function decompress_RR_plain(hdr_ptr, msg_size, rr, entry) + local ptr, rdlength = decompress_name(hdr_ptr, msg_size, rr.rdata, + entry.key.dnsAnswerRdata, + ffi.sizeof(entry.key.dnsAnswerRdata)) + entry.key.dnsAnswerRdataLen = rdlength + return ptr +end + +local mx_rdata_t = ffi.typeof([[ + struct { + uint16_t preference; + uint8_t exchange[0]; + } +]]) +local mx_rdata_ptr_t = ffi.typeof("$*", mx_rdata_t) +local function decompress_RR_MX(hdr_ptr, msg_size, rr, entry) + local mx_src = ffi.cast(mx_rdata_ptr_t, rr.rdata) + local mx_dst = ffi.cast(mx_rdata_ptr_t, entry.key.dnsAnswerRdata) + mx_dst.preference = mx_src.preference + local ptr, length = + decompress_name(hdr_ptr, msg_size, mx_src.exchange, + mx_dst.exchange, + ffi.sizeof(entry.key.dnsAnswerRdata) - 2) + local rdlength = length + 2 + entry.key.dnsAnswerRdataLen = rdlength + return ptr +end + +local soa_rdata_t = ffi.typeof([[ + struct { + uint32_t serial; + uint32_t refresh; + uint32_t retry; + uint32_t expire; + uint32_t minimum; + } +]]) +local function decompress_RR_SOA(hdr_ptr, msg_size, rr, entry) + local size = ffi.sizeof(entry.key.dnsAnswerRdata) + local dst = entry.key.dnsAnswerRdata + -- MNAME + local ptr, length = + decompress_name(hdr_ptr, msg_size, rr.rdata, dst, size) + if ptr ~= nil then + local rdlength = ffi.sizeof(soa_rdata_t) + length + local avail = size - length + dst = dst + length + -- RNAME + ptr, length = decompress_name(hdr_ptr, msg_size, ptr, dst, avail) + if ptr ~= nil then + 
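-- the uncompressed RNAME contributes `length` more bytes to rdlength
+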
rdlength = rdlength + length + avail = avail - length + dst = dst + length + if avail > 0 then + ffi.copy(dst, ptr, math.min(avail, ffi.sizeof(soa_rdata_t))) + end + entry.key.dnsAnswerRdataLen = rdlength + end + end + return ptr +end + +local function decompress_rdata_none(hdr_ptr, msg_size, rr, entry) + local rdlength = lib.ntohs(rr.rdlength) + ffi.copy(entry.key.dnsAnswerRdata, rr.rdata, + math.min(rdlength, ffi.sizeof(entry.key.dnsAnswerRdata))) + entry.key.dnsAnswerRdataLen = rdlength + return true +end + +-- List of well-known RR types (see RFC3597, section 4) whose RDATA +-- sections can contain compressed names. The functions referenced +-- here replace such names with their uncompressed equivalent. +local decompress_rdata_fns = setmetatable( + { + [2] = decompress_RR_plain, -- NS + [5] = decompress_RR_plain, -- CNAME + [6] = decompress_RR_SOA, -- SOA + [12] = decompress_RR_plain, -- PTR + [15] = decompress_RR_MX, -- MX + }, + { __index = + function() + return decompress_rdata_none + end + } +) +local function extract_answer_rr(hdr_ptr, msg_size, ptr, entry) + local ptr, len = decompress_name(hdr_ptr, msg_size, ptr, + entry.key.dnsAnswerName, + ffi.sizeof(entry.key.dnsAnswerName)) + if ptr == nil then + return nil, nil, nil + end + if available_bytes(hdr_ptr, msg_size, ptr) < ffi.sizeof(rr_t) then + return nil, nil, nil + end + local rr = ffi.cast(rr_ptr_t, ptr) + local type = lib.ntohs(rr.type) + local rdlength = lib.ntohs(rr.rdlength) + if rdlength > 0 then + if available_bytes(hdr_ptr, msg_size, rr.rdata) < rdlength then + return nil, nil, nil + end + if not decompress_rdata_fns[type](hdr_ptr, msg_size, rr, entry) then + return nil, nil, nil + end + end + local class = lib.ntohs(rr.class) + entry.key.dnsAnswerType = type + entry.key.dnsAnswerClass = class + entry.key.dnsAnswerTtl = lib.ntohl(rr.ttl) + return type, class, rr.rdata + rdlength +end + +function extract(hdr_ptr, msg_size, entry) + if ffi.sizeof(dns_hdr_t) > msg_size then + return + end + local dns_hdr = ffi.cast(dns_hdr_ptr_t, hdr_ptr) + entry.key.dnsFlagsCodes = lib.ntohs(dns_hdr.flags) + if lib.ntohs(dns_hdr.qcount) == 1 then + entry.key.dnsQuestionCount = 1 + local ptr, _ = decompress_name(hdr_ptr, msg_size, + hdr_ptr + ffi.sizeof(dns_hdr_t), + entry.key.dnsQuestionName, + ffi.sizeof(entry.key.dnsQuestionName)) + if ptr == nil then + ffi.fill(entry.key.dnsQuestionName, + ffi.sizeof(entry.key.dnsQuestionName)) + return + end + -- The question section only has a type and class + if available_bytes(hdr_ptr, msg_size, ptr) < 4 then + return + end + local rr = ffi.cast(rr_ptr_t, ptr) + entry.key.dnsQuestionType = lib.ntohs(rr.type) + entry.key.dnsQuestionClass = lib.ntohs(rr.class) + ptr = ptr + 4 + local anscount = lib.ntohs(dns_hdr.anscount) + entry.key.dnsAnswerCount = anscount + if anscount > 0 then + -- Extract the first answer + local type, class, ptr = + extract_answer_rr(hdr_ptr, msg_size, ptr, entry) + + -- Skip to the first RR which is neither a CNAME nor a RRSIG + if skip_CNAMEs_RRSIGs then + anscount = anscount - 1 + while (type == 5 or type == 46) and class == 1 and anscount > 0 do + ffi.fill(entry.key.dnsAnswerName, + ffi.sizeof(entry.key.dnsAnswerName)) + ffi.fill(entry.key.dnsAnswerRdata, + ffi.sizeof(entry.key.dnsAnswerRdata)) + type, class, ptr = extract_answer_rr(hdr_ptr, msg_size, ptr, entry) + anscount = anscount - 1 + end + end + end + end +end diff --git a/src/apps/ipfix/ipfix-information-elements-local.inc b/src/apps/ipfix/ipfix-information-elements-local.inc new file mode 100644 index 
0000000000..33082a052e --- /dev/null +++ b/src/apps/ipfix/ipfix-information-elements-local.inc @@ -0,0 +1,15 @@ +6,tcpControlBitsReduced,unsigned8,,,,,,,,[RFC5102],,2013-02-18 +2946:100,dnsFlagsCodes,unsigned16,,,,,,,,, +2946:101,dnsQuestionCount,unsigned16,,,,,,,,, +2946:102,dnsAnswerCount,unsigned16,,,,,,,,, +2946:103,dnsQuestionName,octetArray,Domain name in uncompressed on-the-wire encoding,,,,,,,, +2946:104,dnsQuestionType,unsigned16,,,,,,,,, +2946:105,dnsQuestionClass,unsigned16,,,,,,,,, +2946:106,dnsAnswerName,octetArray,Domain name in uncompressed on-the-wire encoding,,"In case of a CNAME chain, +this is the name of the first non-CNAME answer record or the name of the last CNAME record",,,,,, +2946:107,dnsAnswerType,unsigned16,,,,,,,,, +2946:108,dnsAnswerClass,unsigned16,,,,,,,,, +2946:109,dnsAnswerTtl,unsigned32,,,,,,,,, +2946:110,dnsAnswerRdata,octetArray,"On-the-wire encoding of the answer record's rdata section. For well-known record types, +compressed domain names have been replaced with their uncompressed counterparts",,,,,,,, +2946:111,dnsAnswerRdataLen,unsigned16,,,,,,,,, diff --git a/src/apps/ipfix/ipfix.lua b/src/apps/ipfix/ipfix.lua index ccd2b1672f..e022f36096 100644 --- a/src/apps/ipfix/ipfix.lua +++ b/src/apps/ipfix/ipfix.lua @@ -7,20 +7,28 @@ module(..., package.seeall) local bit = require("bit") local ffi = require("ffi") -local pf = require("pf") local template = require("apps.ipfix.template") +local maps = require("apps.ipfix.maps") +local metadata = require("apps.rss.metadata") local lib = require("core.lib") local link = require("core.link") local packet = require("core.packet") +local shm = require("core.shm") +local counter = require("core.counter") local datagram = require("lib.protocol.datagram") local ether = require("lib.protocol.ethernet") +local dot1q = require("lib.protocol.dot1q") local ipv4 = require("lib.protocol.ipv4") local ipv6 = require("lib.protocol.ipv6") local udp = require("lib.protocol.udp") local ctable = require("lib.ctable") +local logger = require("lib.logger") +local token_bucket = require("lib.token_bucket") local C = ffi.C +local S = require("syscall") local htonl, htons = lib.htonl, lib.htons +local metadata_add, metadata_get = metadata.add, metadata.get local debug = lib.getenv("FLOW_EXPORT_DEBUG") @@ -89,21 +97,6 @@ local function padded_length(len) return bit.band(len + max_padding, bit.bnot(max_padding)) end --- Sadly, for NetFlow v9, the header needs to know the number of --- records in a message. So before flushing out a message, a FlowSet --- will append the record count, and then the exporter needs to slurp --- this data off before adding the NetFlow/IPFIX header. -local uint16_ptr_t = ffi.typeof('uint16_t*') -local function add_record_count(pkt, count) - pkt.length = pkt.length + 2 - ffi.cast(uint16_ptr_t, pkt.data + pkt.length)[-1] = count -end -local function remove_record_count(pkt, count) - local count = ffi.cast(uint16_ptr_t, pkt.data + pkt.length)[-1] - pkt.length = pkt.length - 2 - return count -end - -- The real work in the IPFIX app is performed by FlowSet objects, -- which record and export flows. 
However an IPv4 FlowSet won't know -- what to do with IPv6 packets, so the IPFIX app can have multiple @@ -127,10 +120,55 @@ end FlowSet = {} -function FlowSet:new (template, args) +function FlowSet:new (spec, args) + local t = {} + for s in spec:split(':') do + table.insert(t, s) + end + assert(#t == 1 or #t == 2, "Invalid template specifier: "..spec) + local template_name, cache_size = unpack(t) + assert(template.templates[template_name], + "Undefined template : "..template_name) + if cache_size then + assert(cache_size:match("^%d+$"), + string.format("Invalid cache size for template %s: %s", + template_name, cache_size)) + args.cache_size = tonumber(cache_size) + end + + local template = + template.make_template_info(template.templates[template_name]) + template.logger = logger.new({ date = args.log_date, + module = ("[%5d]"):format(S.getpid()) + .." IPFIX template #"..template.id }) + template.name = template_name + template.maps = {} + for _, name in ipairs(template.require_maps) do + assert(args.maps[name], + string.format("Template #%d: required map %s " + .."not configured", template.id, name)) + template.maps[name] = maps.mk_map(name, args.maps[name], + nil, args.maps_log_fh) + end + + assert(args.active_timeout > args.scan_time, + string.format("Template #%d: active timeout (%d) " + .."must be larger than scan time (%d)", + template.id, args.active_timeout, + args.scan_time)) + assert(args.idle_timeout > args.scan_time, + string.format("Template #%d: idle timeout (%d) " + .."must be larger than scan time (%d)", + template.id, args.idle_timeout, + args.scan_time)) local o = { template = template, + flush_timer = (args.flush_timeout > 0 and + lib.throttle(args.flush_timeout)) + or function () return true end, idle_timeout = assert(args.idle_timeout), - active_timeout = assert(args.active_timeout) } + active_timeout = assert(args.active_timeout), + scan_time = args.scan_time, + parent = assert(args.parent) } if args.version == 9 then o.template_id = V9_TEMPLATE_ID elseif args.version == 10 then o.template_id = V10_TEMPLATE_ID @@ -153,34 +191,123 @@ function FlowSet:new (template, args) local params = { key_type = template.key_t, value_type = template.value_t, - max_occupancy_rate = 0.4, + max_occupancy_rate = args.max_load_factor, + resize_callback = function(table, old_size) + if old_size > 0 then + template.logger:log("resize flow cache "..old_size.. 
+ " -> "..table.size) + end + require('jit').flush() + o.table_tb:set(math.ceil(table.size / o.scan_time)) + end, + max_displacement_limit = 30 } if args.cache_size then - params.initial_size = math.ceil(args.cache_size / 0.4) + params.initial_size = math.ceil(args.cache_size / args.max_load_factor) end + o.table_tb = token_bucket.new({ rate = 1 }) -- Will be set by resize_callback o.table = ctable.new(params) + o.table_tstamp = C.get_unix_time() + o.table_scan_time = 0 o.scratch_entry = o.table.entry_type() o.expiry_cursor = 0 + o.scan_protection = args.scan_protection + local sp = { table = {} } + if args.scan_protection.enable then + aggr_info = template.aggregate_info + sp.aggr_key_fn, sp.ntop_fn = aggr_info.mk_fns( + args.scan_protection.aggregate_v4, + args.scan_protection.aggregate_v6 + ) + -- Will be set by resize_callback + sp.table_tb = token_bucket.new({ rate = 1 }) + sp.export_rate_tb = token_bucket.new( + { rate = args.scan_protection.export_rate }) + sp.table = ctable.new({ + key_type = aggr_info.key_type, + value_type = ffi.typeof([[ + struct { + uint8_t suppress; + uint64_t tstamp; + uint64_t flow_count; + uint64_t packets; + uint64_t octets; + uint64_t tstamp_drop_start; + uint64_t drops; + uint64_t exports; + } __attribute__((packed)) + ]]), + initial_size = args.scan_protection.cache_size, + max_occupancy_rate = args.scan_protection.max_load_factor, + resize_callback = function(table, old_size) + if old_size > 0 then + template.logger:log("resize flow rate tracking cache " + ..old_size.." -> "..table.size) + end + require('jit').flush() + sp.table_tb:set( + math.ceil(table.size / args.scan_protection.interval) + ) + end, + max_displacement_limit = 30 + }) + sp.expiry_cursor = 0 + sp.scratch_entry = sp.table.entry_type() + end + o.sp = sp + o.match = template.match o.incoming_link_name, o.incoming = new_internal_link('IPFIX incoming') + -- Generic per-template counters + local shm_name = "ipfix_templates/"..args.instance.."/"..template.id + local frame_init = { + packets_in = { counter, 0 }, + flow_export_packets = { counter, 0 }, + exported_flows = { counter, 0 }, + table_scan_time = { counter, 0 }, + } + local function add_table_counters(prefix, table) + for _, item in ipairs({ 'size', 'byte_size', + 'occupancy', 'max_displacement' }) do + frame_init[prefix..'_'..item] = { counter, table[item] } + end + end + add_table_counters('table', o.table) + add_table_counters('rate_table', o.sp.table) + o.shm = shm.create_frame(shm_name, frame_init) + + -- Template-specific counters + if template.counters then + local conf = {} + for name, _ in pairs(template.counters) do + conf[name] = { counter, 0 } + end + o.shm_template = + shm.create_frame(shm_name.."/stats", conf) + end return setmetatable(o, { __index = self }) end +function FlowSet:id() + return string.format("%s(#%d)", self.template.name, self.template.id) +end + function FlowSet:record_flows(timestamp) local entry = self.scratch_entry timestamp = to_milliseconds(timestamp) for i=1,link.nreadable(self.incoming) do local pkt = link.receive(self.incoming) - self.template.extract(pkt, timestamp, entry) - packet.free(pkt) + counter.add(self.shm.packets_in) + self.template:extract(pkt, timestamp, entry) local lookup_result = self.table:lookup_ptr(entry.key) if lookup_result == nil then self.table:add(entry.key, entry.value) else - self.template.accumulate(lookup_result, entry) + self.template:accumulate(lookup_result, entry, pkt) end + packet.free(pkt) end end @@ -206,6 +333,7 @@ function FlowSet:add_data_record(record, out) 
ffi.copy(ptr, record, record_len) self.template.swap_fn(ffi.cast(self.template.record_ptr_t, ptr)) pkt.length = pkt.length + record_len + counter.add(self.shm.exported_flows) self.record_count = self.record_count + 1 if self.record_count == self.max_record_count then @@ -230,9 +358,11 @@ function FlowSet:flush_data_records(out) set_header.id = htons(self.template.id) set_header.length = htons(pkt.length) - -- Add record count and push. - add_record_count(pkt, record_count) + -- Add headers provided by the IPFIX object that created us + pkt = self.parent:add_ipfix_header(pkt, record_count) + pkt = self.parent:add_transport_headers(pkt) link.transmit(out, pkt) + counter.add(self.shm.flow_export_packets) end -- Print debugging messages for a flow. @@ -245,50 +375,203 @@ function FlowSet:debug_flow(entry, msg) end end +function FlowSet:expire_flow_rate_records(now) + if not self.scan_protection.enable then + return + end + local cursor = self.sp.expiry_cursor + local now_ms = to_milliseconds(now) + local interval = to_milliseconds(self.scan_protection.interval) + for i = 1, self.sp.table_tb:take_burst() do + local entry + cursor, entry = self.sp.table:next_entry(cursor, cursor + 1) + if entry then + if now_ms - tonumber(entry.value.tstamp) > 2*interval then + self.sp.table:remove_ptr(entry) + else + cursor = cursor + 1 + end + end + end + self.sp.expiry_cursor = cursor +end + +local function reset_rate_entry(entry, flow_entry, timestamp) + entry.value.tstamp = timestamp + entry.value.flow_count = 1 + entry.value.packets = flow_entry.value.packetDeltaCount + entry.value.octets = flow_entry.value.octetDeltaCount +end + +local function reset_drop_stats(entry, timestamp) + entry.value.drops = 0 + entry.value.exports = 0 + entry.value.tstamp_drop_start = timestamp +end + +-- To implement the scan-protection feature, we keep track of flows +-- that satisfy the configured criteria for packets-per-flow (ppf) and +-- bytes-per-packet (bpp) per prefix aggregate (defaulting to /24 and +-- /64 for IPv4 and IPv6, respectively). +function FlowSet:suppress_flow(flow_entry, timestamp) + local config = self.scan_protection + if not config.enable then + return false + end + + -- Only consider flows that satisfy the ppf and bpp criteria + local ppf = flow_entry.value.packetDeltaCount + local bpp = flow_entry.value.octetDeltaCount/ppf + if (ppf > config.max_packets_per_flow or bpp > config.max_bytes_per_packet) then + return false + end + + local entry = self.sp.scratch_entry + self.sp.aggr_key_fn(flow_entry.key, entry.key) + local result = self.sp.table:lookup_ptr(entry.key) + if result then + local aggr = result.value + local interval = tonumber(timestamp - aggr.tstamp)/1000 + if interval >= config.interval then + local fps = aggr.flow_count/interval + local drop_interval = (timestamp - aggr.tstamp_drop_start)/1000 + if (fps >= config.threshold_rate) then + local aggr_ppf = aggr.packets/aggr.flow_count + local aggr_bpp = aggr.octets/aggr.packets + if aggr.suppress == 0 then + self.template.logger:log( + string.format("Flow rate threshold exceeded from %s: ".. + "%d fps, %d bpp, %d ppf", + self.sp.ntop_fn(entry.key), + tonumber(fps), tonumber(aggr_bpp), tonumber(aggr_ppf))) + reset_drop_stats(result, timestamp) + aggr.suppress = 1 + elseif drop_interval > config.report_interval then + self.template.logger:log( + string.format("Flow rate report for %s: ".. + "%d fps, %d bpp, %d ppf, %d flows dropped, ".. 
+ "%d exported in past %d seconds", + self.sp.ntop_fn(entry.key), + tonumber(fps), tonumber(aggr_bpp), tonumber(aggr_ppf), + tonumber(aggr.drops), + tonumber(aggr.exports), + tonumber(drop_interval))) + reset_drop_stats(result, timestamp) + end + else + if aggr.suppress == 1 then + self.template.logger:log( + string.format("Flow rate below threshold from %s: ".. + "%d flows dropped, %d exported in past ".. + "%d seconds ", + self.sp.ntop_fn(entry.key), + tonumber(aggr.drops), + tonumber(aggr.exports), + tonumber(drop_interval))) + aggr.suppress = 0 + end + end + reset_rate_entry(result, flow_entry, timestamp) + else + aggr.flow_count = aggr.flow_count + 1 + aggr.packets = aggr.packets + + flow_entry.value.packetDeltaCount + aggr.octets = aggr.octets + + flow_entry.value.octetDeltaCount + end + if config.drop and aggr.suppress == 1 then + -- NB: this rate-limiter applies to flows from *all* + -- aggregates, while the threshold rate applies to each + -- aggregate individually. + if self.sp.export_rate_tb:take(1) then + aggr.exports = aggr.exports + 1 + return false + else + aggr.drops = aggr.drops + 1 + return true + end + end + else + ffi.fill(entry.value, ffi.sizeof(entry.value)) + reset_rate_entry(entry, flow_entry, timestamp) + self.sp.table:add(entry.key, entry.value) + end + return false +end + -- Walk through flow set to see if flow records need to be expired. -- Collect expired records and export them to the collector. function FlowSet:expire_records(out, now) - -- For a breath time of 100us, we will get 1e4 calls to push() every - -- second. We'd like to sweep through the flow table once every 10 - -- seconds, so on each breath we process 1e-5th of the table. local cursor = self.expiry_cursor - local limit = cursor + math.ceil(self.table.size * 1e-5) - now = to_milliseconds(now) + now_ms = to_milliseconds(now) local active = to_milliseconds(self.active_timeout) local idle = to_milliseconds(self.idle_timeout) - while true do + for i = 1, self.table_tb:take_burst() do local entry - cursor, entry = self.table:next_entry(cursor, limit) - if not entry then break end - if now - tonumber(entry.value.flowEndMilliseconds) > idle then - self:debug_flow(entry, "expire idle") - -- Relying on key and value being contiguous. - self:add_data_record(entry.key, out) - self.table:remove(entry.key) - elseif now - tonumber(entry.value.flowStartMilliseconds) > active then - self:debug_flow(entry, "expire active") - -- TODO: what should timers reset to? - entry.value.flowStartMilliseconds = now - entry.value.flowEndMilliseconds = now - entry.value.packetDeltaCount = 0 - entry.value.octetDeltaCount = 0 - self:add_data_record(entry.key, out) - cursor = cursor + 1 + cursor, entry = self.table:next_entry(cursor, cursor + 1) + if entry then + if now_ms - tonumber(entry.value.flowEndMilliseconds) > idle then + self:debug_flow(entry, "expire idle") + if (not self:suppress_flow(entry, now_ms) and + entry.value.packetDeltaCount > 0) then + -- Relying on key and value being contiguous. + self:add_data_record(entry.key, out) + end + self.table:remove_ptr(entry) + elseif now_ms - tonumber(entry.value.flowStartMilliseconds) > active then + self:debug_flow(entry, "expire active") + if (not self:suppress_flow(entry, now_ms) and + entry.value.packetDeltaCount > 0) then + self:add_data_record(entry.key, out) + end + entry.value.flowStartMilliseconds = now_ms + entry.value.flowEndMilliseconds = now_ms + entry.value.packetDeltaCount = 0 + entry.value.octetDeltaCount = 0 + cursor = cursor + 1 + else + -- Flow still live. 
+ cursor = cursor + 1 + end else - -- Flow still live. - cursor = cursor + 1 + -- Empty slot or end of table + if cursor == 0 then + self.table_scan_time = now - self.table_tstamp + self.table_tstamp = now + end end end self.expiry_cursor = cursor - self:flush_data_records(out) + if self.flush_timer() then self:flush_data_records(out) end +end + +function FlowSet:sync_stats() + counter.set(self.shm.table_size, self.table.size) + counter.set(self.shm.table_byte_size, self.table.byte_size) + counter.set(self.shm.table_occupancy, self.table.occupancy) + counter.set(self.shm.table_max_displacement, self.table.max_displacement) + counter.set(self.shm.table_scan_time, self.table_scan_time) + counter.set(self.shm.rate_table_size, self.sp.table.size or 0) + counter.set(self.shm.rate_table_byte_size, self.sp.table.byte_size or 0) + counter.set(self.shm.rate_table_occupancy, self.sp.table.occupancy or 0) + counter.set(self.shm.rate_table_max_displacement, self.sp.table.max_displacement or 0) + if self.shm_template then + for _, name in ipairs(self.template.counters_names) do + counter.set(self.shm_template[name], self.template.counters[name]) + end + end end IPFIX = {} local ipfix_config_params = { idle_timeout = { default = 300 }, active_timeout = { default = 120 }, + flush_timeout = { default = 10 }, cache_size = { default = 20000 }, + max_load_factor = { default = 0.4 }, + scan_protection = { default = {} }, + scan_time = { default = 10 }, -- RFC 5153 §6.2 recommends a 10-minute template refresh -- configurable from 1 minute to 1 day. template_refresh_interval = { default = 600 }, @@ -299,23 +582,89 @@ local ipfix_config_params = { mtu = { default = 512 }, observation_domain = { default = 256 }, exporter_ip = { required = true }, + exporter_eth_src = { default = '00:00:00:00:00:00' }, + exporter_eth_dst = { default = '00:00:00:00:00:00' }, collector_ip = { required = true }, collector_port = { required = true }, - templates = { default = { template.v4, template.v6 } } + templates = { default = { "v4", "v6" } }, + maps = { default = {} }, + maps_log_fh = { default = nil }, + -- Used to distinguish instances of the app running in the same + -- process + instance = { default = 1 }, + add_packet_metadata = { default = true }, + log_date = { default = true } +} + +local scan_protection_params = { + enable = { default = false }, + drop = { default = true }, + aggregate_v4 = { default = 24 }, + aggregate_v6 = { default = 64 }, + cache_size = { default = 20000 }, + max_load_factor = { default = 0.6 }, + interval = { default = 300 }, + report_interval = { default = 43200 }, + threshold_rate = { default = 10000 }, + export_rate = { default = 500 }, + max_bytes_per_packet = { default = 90 }, + max_packets_per_flow = { default = 2 } } +local function setup_transport_header(self, config) + -- Prepare transport headers to prepend to each export packet + -- TODO: Support IPv6. + local eth_h = ether:new({ src = ether:pton(config.exporter_eth_src), + dst = ether:pton(config.exporter_eth_dst), + type = 0x0800 }) + local ip_h = ipv4:new({ src = ipv4:pton(config.exporter_ip), + dst = ipv4:pton(config.collector_ip), + protocol = 17, + ttl = 64 }) + local udp_h = udp:new({ src_port = math.random(49152, 65535), + dst_port = config.collector_port }) + local transport_headers = datagram:new(packet.allocate()) + transport_headers:push(udp_h) + transport_headers:push(ip_h) + transport_headers:push(eth_h) + -- We need to update the IP and UDP headers after adding a payload. 
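+   -- Their length and checksum fields are filled in for each export
+   -- packet by add_transport_headers().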
+ -- The following re-locates ip_h and udp_h to point to the headers + -- in the template packet. + transport_headers:new(transport_headers:packet(), ether) -- Reset the parse stack + transport_headers:parse_n(3) + _, ip_h, udp_h = unpack(transport_headers:stack()) + self.transport_headers = { + ip_h = ip_h, + udp_h = udp_h, + pkt = transport_headers:packet() + } +end + function IPFIX:new(config) config = lib.parse(config, ipfix_config_params) - local o = { sequence_number = 1, - boot_time = engine.now(), + local o = { boot_time = engine.now(), template_refresh_interval = config.template_refresh_interval, next_template_refresh = -1, version = config.ipfix_version, observation_domain = config.observation_domain, - exporter_ip = config.exporter_ip, - exporter_port = math.random(49152, 65535), - collector_ip = config.collector_ip, - collector_port = config.collector_port } + instance = config.instance, + add_packet_metadata = config.add_packet_metadata, + logger = logger.new({ date = config.log_date, + module = ("[%5d]"):format(S.getpid()) + .." IPFIX exporter"} ) } + o.shm = { + -- Total number of packets received + received_packets = { counter }, + -- Packets not matched by any flow set + ignored_packets = { counter }, + -- Number of template packets sent + template_packets = { counter }, + -- Non-wrapping sequence number (see add_ipfix_header() for a + -- brief description of the semantics for IPFIX and Netflowv9) + sequence_number = { counter, 1 }, + version = { counter, o.version }, + observation_domain = { counter, o.observation_domain }, + } if o.version == 9 then o.header_t = netflow_v9_packet_header_t @@ -327,6 +676,8 @@ function IPFIX:new(config) o.header_ptr_t = ptr_to(o.header_t) o.header_size = ffi.sizeof(o.header_t) + setup_transport_header(o, config) + -- FIXME: Assuming we export to IPv4 address. 
   local l3_header_len = 20
   local l4_header_len = 8

@@ -335,25 +686,51 @@
   local flow_set_args = { mtu = config.mtu - total_header_len,
                           version = config.ipfix_version,
                           cache_size = config.cache_size,
+                          max_load_factor = config.max_load_factor,
+                          scan_protection = lib.parse(config.scan_protection,
+                                                      scan_protection_params),
                           idle_timeout = config.idle_timeout,
-                          active_timeout = config.active_timeout }
+                          active_timeout = config.active_timeout,
+                          scan_time = config.scan_time,
+                          flush_timeout = config.flush_timeout,
+                          parent = o,
+                          maps = config.maps,
+                          maps_log_fh = config.maps_log_fh,
+                          instance = config.instance,
+                          log_date = config.log_date }

   o.flow_sets = {}
   for _, template in ipairs(config.templates) do
      table.insert(o.flow_sets, FlowSet:new(template, flow_set_args))
+      o.logger:log("Added template "..o.flow_sets[#o.flow_sets]:id())
   end

-   self.outgoing_link_name, self.outgoing = new_internal_link('IPFIX outgoing')
-
+   o.stats_timer = lib.throttle(5)
   return setmetatable(o, { __index = self })
end

+function IPFIX:reconfig(config)
+   -- Only support reconfiguration of the transport header for now
+   config = lib.parse(config, ipfix_config_params)
+   setup_transport_header(self, config)
+end
+
function IPFIX:send_template_records(out)
   local pkt = packet.allocate()
   for _, flow_set in ipairs(self.flow_sets) do
      pkt = flow_set:append_template_record(pkt)
   end
-   add_record_count(pkt, #self.flow_sets)
+   local record_count
+   if self.version == 9 then
+      record_count = #self.flow_sets
+   else
+      -- For IPFIX, template records are not accounted for in the
+      -- sequence number of the header
+      record_count = 0
+   end
+   pkt = self:add_ipfix_header(pkt, record_count)
+   pkt = self:add_transport_headers(pkt)
+   counter.add(self.shm.template_packets)
   link.transmit(out, pkt)
end

@@ -362,82 +739,99 @@
function IPFIX:add_ipfix_header(pkt, count)
   local header = ffi.cast(self.header_ptr_t, pkt.data)

   header.version = htons(self.version)
+   header.sequence_number = htonl(tonumber(counter.read(self.shm.sequence_number)))
   if self.version == 9 then
+      -- record_count counts the number of all records in this packet
+      -- (template and data)
      header.record_count = htons(count)
+      -- sequence_number counts the number of exported packets
+      counter.add(self.shm.sequence_number)
      header.uptime = htonl(to_milliseconds(engine.now() - self.boot_time))
   elseif self.version == 10 then
+      -- sequence_number counts the cumulative number of data records
+      -- (i.e. excluding template and option records)
+      counter.add(self.shm.sequence_number, count)
      header.byte_length = htons(pkt.length)
   end
   header.timestamp = htonl(math.floor(C.get_unix_time()))
-   header.sequence_number = htonl(self.sequence_number)
   header.observation_domain = htonl(self.observation_domain)

-   self.sequence_number = self.sequence_number + 1
-
   return pkt
end

function IPFIX:add_transport_headers (pkt)
-   -- TODO: Support IPv6.
- local eth_h = ether:new({ src = ether:pton('00:00:00:00:00:00'), - dst = ether:pton('00:00:00:00:00:00'), - type = 0x0800 }) - local ip_h = ipv4:new({ src = ipv4:pton(self.exporter_ip), - dst = ipv4:pton(self.collector_ip), - protocol = 17, - ttl = 64, - flags = 0x02 }) - local udp_h = udp:new({ src_port = self.exporter_port, - dst_port = self.collector_port }) - + local headers = self.transport_headers + local ip_h, udp_h = headers.ip_h, headers.udp_h udp_h:length(udp_h:sizeof() + pkt.length) udp_h:checksum(pkt.data, pkt.length, ip_h) ip_h:total_length(ip_h:sizeof() + udp_h:sizeof() + pkt.length) ip_h:checksum() + return packet.prepend(pkt, headers.pkt.data, headers.pkt.length) +end - local dgram = datagram:new(pkt) - dgram:push(udp_h) - dgram:push(ip_h) - dgram:push(eth_h) - return dgram:packet() +function IPFIX:push () + for _, input in ipairs(self.input) do + self:push1(input) + end end -function IPFIX:push() - local input = self.input.input +function IPFIX:push1(input) -- FIXME: Use engine.now() for monotonic time. Have to check that -- engine.now() gives values relative to the UNIX epoch though. local timestamp = ffi.C.get_unix_time() - assert(self.output.output, "missing output link") - local outgoing = self.outgoing - if self.next_template_refresh < engine.now() then - self.next_template_refresh = engine.now() + self.template_refresh_interval - self:send_template_records(outgoing) + local flow_sets = self.flow_sets + local nreadable = link.nreadable(input) + counter.add(self.shm.received_packets, nreadable) + + if self.add_packet_metadata then + for _ = 1, nreadable do + local p = link.receive(input) + metadata_add(p) + link.transmit(input, p) + end end - local flow_sets = self.flow_sets - for i=1,link.nreadable(input) do - local pkt = link.receive(input) - local handled = false - for _,set in ipairs(flow_sets) do - if set.match(pkt.data, pkt.length) then - link.transmit(set.incoming, pkt) - handled = true - break + for _,set in ipairs(flow_sets) do + for _ = 1, nreadable do + local p = link.receive(input) + local md = metadata_get(p) + if set.match(md.filter_start, md.filter_length) then + link.transmit(set.incoming, p) + else + link.transmit(input, p) end end - -- Drop packet if it didn't match any flow set. 
- if not handled then packet.free(pkt) end + nreadable = link.nreadable(input) + end + + counter.add(self.shm.ignored_packets, nreadable) + for _ = 1, nreadable do + packet.free(link.receive(input)) end for _,set in ipairs(flow_sets) do set:record_flows(timestamp) end - for _,set in ipairs(flow_sets) do set:expire_records(outgoing, timestamp) end - for i=1,link.nreadable(outgoing) do - local pkt = link.receive(outgoing) - pkt = self:add_ipfix_header(pkt, remove_record_count(pkt)) - pkt = self:add_transport_headers(pkt) - link.transmit(self.output.output, pkt) +end + +function IPFIX:tick() + local timestamp = ffi.C.get_unix_time() + assert(self.output.output, "missing output link") + local output = self.output.output + for _,set in ipairs(self.flow_sets) do + set:expire_records(output, timestamp) + set:expire_flow_rate_records(timestamp) + end + + if self.next_template_refresh < engine.now() then + self.next_template_refresh = engine.now() + self.template_refresh_interval + self:send_template_records(self.output.output) + end + + if self.stats_timer() then + for _,set in ipairs(self.flow_sets) do + set:sync_stats() + end end end @@ -448,23 +842,41 @@ function selftest() local ethertype_ipv6 = consts.ethertype_ipv6 local ipfix = IPFIX:new({ exporter_ip = "192.168.1.2", collector_ip = "192.168.1.1", - collector_port = 4739 }) + collector_port = 4739, + flush_timeout = 0, + scan_time = 1, + templates = { + 'v4_extended', 'v6_extended' + }, + maps = { + mac_to_as = "apps/ipfix/test/mac_to_as", + vlan_to_ifindex = "apps/ipfix/test/vlan_to_ifindex", + pfx4_to_as = "apps/ipfix/test/pfx4_to_as.csv", + pfx6_to_as = "apps/ipfix/test/pfx6_to_as.csv" + }}) + ipfix.shm = shm.create_frame("apps/ipfix", ipfix.shm) -- Mock input and output. local input_name, input = new_internal_link('ipfix selftest input') local output_name, output = new_internal_link('ipfix selftest output') - ipfix.input, ipfix.output = { input = input }, { output = output } + ipfix.input, ipfix.output = { [1] = input, input = input }, { [1] = output, output = output } local ipv4_flows, ipv6_flows = unpack(ipfix.flow_sets) -- Test helper that supplies a packet with some given fields. - local function test(src_ip, dst_ip, src_port, dst_port) + local function test(src_ip, dst_ip, src_port, dst_port, vlan_id) local is_ipv6 = not not src_ip:match(':') local proto = is_ipv6 and ethertype_ipv6 or ethertype_ipv4 local eth = ether:new({ src = ether:pton("00:11:22:33:44:55"), - dst = ether:pton("55:44:33:22:11:00"), + dst = ether:pton("50:44:33:22:11:00"), type = proto }) + local vlan local ip + if vlan_id then + eth:type(dot1q.TPID) + vlan = dot1q:new{ id = vlan_id, type = proto } + end + if is_ipv6 then ip = ipv6:new({ src = ipv6:pton(src_ip), dst = ipv6:pton(dst_ip), next_header = IP_PROTO_UDP, ttl = 64 }) @@ -477,6 +889,7 @@ function selftest() dg:push(udp) dg:push(ip) + if vlan then dg:push(vlan) end dg:push(eth) link.transmit(input, dg:packet()) @@ -484,11 +897,12 @@ function selftest() end -- Populate with some known flows. 
- test("192.168.1.1", "192.168.1.25", 9999, 80) + test("192.168.1.1", "192.168.1.25", 9999, 80, 1) test("192.168.1.25", "192.168.1.1", 3653, 23552) test("192.168.1.25", "8.8.8.8", 58342, 53) test("8.8.8.8", "192.168.1.25", 53, 58342) test("2001:4860:4860::8888", "2001:db8::ff00:42:8329", 53, 57777) + ipfix:tick() assert(ipv4_flows.table.occupancy == 4, string.format("wrong number of v4 flows: %d", ipv4_flows.table.occupancy)) assert(ipv6_flows.table.occupancy == 1, @@ -512,6 +926,12 @@ function selftest() local result = ipv4_flows.table:lookup_ptr(key) assert(result, "key not found") assert(result.value.packetDeltaCount == 1) + assert(result.value.bgpSourceAsNumber == 1234) + assert(result.value.bgpDestinationAsNumber == 5678) + assert(result.value.ingressInterface == 2) + assert(result.value.egressInterface == 3) + assert(result.value.bgpPrevAdjacentAsNumber == 321) + assert(result.value.bgpNextAdjacentAsNumber == 654) -- make sure the count is incremented on the same flow test("192.168.1.1", "192.168.1.25", 9999, 80) @@ -529,6 +949,8 @@ function selftest() local result = ipv6_flows.table:lookup_ptr(key) assert(result, "key not found") assert(result.value.packetDeltaCount == 1) + assert(result.value.bgpSourceAsNumber == 1234) + assert(result.value.bgpDestinationAsNumber == 5678) -- sanity check ipv4_flows.table:selfcheck() @@ -552,10 +974,12 @@ function selftest() -- Template message; no data yet. assert(link.nreadable(output) == 1) - -- Cause expiry. By default we do 1e-5th of the table per push, - -- so this should be good. - for i=1,2e5 do ipfix:push() end - -- Template message and data message. + -- Wait for a full scan of the table to complete (1 second, + -- "scan_time") + local now = engine.now() + while engine.now() - now < 1 do + ipfix:tick() + end assert(link.nreadable(output) == 2) local filter = require("pf").compile_filter([[ diff --git a/src/apps/ipfix/maps.lua b/src/apps/ipfix/maps.lua new file mode 100644 index 0000000000..50c17731a2 --- /dev/null +++ b/src/apps/ipfix/maps.lua @@ -0,0 +1,140 @@ +module(..., package.seeall) + +local ffi = require("ffi") +local lib = require("core.lib") +local ctable = require("lib.ctable") +local ethernet = require("lib.protocol.ethernet") +local ipv4 = require("lib.protocol.ipv4") +local ipv6 = require("lib.protocol.ipv6") +local poptrie = require("lib.poptrie") +local logger = require("lib.logger") + +-- Map MAC addresses to peer AS number +-- +-- Used to determine bgpPrevAdjacentAsNumber, bgpNextAdjacentAsNumber +-- from the packet's MAC addresses. File format: +-- - +local mac_to_as_key_t = ffi.typeof("uint8_t[6]") +local mac_to_as_value_t = ffi.typeof("uint32_t") + +local function make_mac_to_as_map(name) + local table = ctable.new({ key_type = mac_to_as_key_t, + value_type = mac_to_as_value_t, + initial_size = 15000, + max_displacement_limit = 30 }) + local key = mac_to_as_key_t() + local value = mac_to_as_value_t() + for line in assert(io.lines(name)) do + local as, mac = line:match("^%s*(%d*)-([0-9a-fA-F:]*)") + assert(as and mac, "MAC-to-AS map: invalid line: "..line) + local key, value = ethernet:pton(mac), tonumber(as) + local result = table:lookup_ptr(key) + if result then + if result.value ~= value then + print("MAC-to-AS map: amibguous mapping: " + ..ethernet:ntop(key)..": "..result.value..", "..value) + end + end + table:add(key, value, true) + end + return table +end + +-- Map VLAN tag to interface Index +-- +-- Used to set ingressInterface, egressInterface based on the VLAN +-- tag. 
+-- This is useful if packets from multiple sources are
+-- multiplexed on the input interface by a device between the metering
+-- process and the port mirrors/optical taps of the monitored links.
+-- The multiplexer adds a VLAN tag to uniquely identify the original
+-- monitored link. The tag is then translated into an interface
+-- index. Only one of the ingressInterface and egressInterface
+-- elements is relevant, depending on the direction of the flow. File
+-- format:
+-- <VLAN>-<ingress-ifindex>-<egress-ifindex>
+local function make_vlan_to_ifindex_map(name)
+   local table = {}
+   for line in assert(io.lines(name)) do
+      local vlan, ingress, egress = line:match("^(%d+)-(%d+)-(%d+)$")
+      assert(vlan and ingress and egress,
+             "VLAN-to-IFIndex map: invalid line: "..line)
+      table[tonumber(vlan)] = {
+         ingress = tonumber(ingress),
+         egress = tonumber(egress)
+      }
+   end
+   return table
+end
+
+-- Map IP address to AS number
+--
+-- Used to set bgpSourceAsNumber, bgpDestinationAsNumber from the IP
+-- source and destination address, respectively. The file contains a
+-- list of prefixes and their proper source AS number based on
+-- authoritative data from the RIRs. This parser supports the format
+-- used by the GeoLite2 database provided by MaxMind:
+-- http://geolite.maxmind.com/download/geoip/database/GeoLite2-ASN-CSV.zip
+local function make_pfx_to_as_map(name, proto)
+   local table = { pt = poptrie.new{direct_pointing=true,
+                                    leaf_t=ffi.typeof("uint32_t")} }
+   if proto == ipv4 then
+      function table:search_bytes (a)
+         return self.pt:lookup32(a)
+      end
+   elseif proto == ipv6 then
+      function table:search_bytes (a)
+         return self.pt:lookup128(a)
+      end
+   else
+      error("Proto must be ipv4 or ipv6")
+   end
+   for line in assert(io.lines(name)) do
+      if not line:match("^network") then
+         local cidr, asn = line:match("([^,]*),(%d+),")
+         asn = tonumber(asn)
+         assert(cidr and asn, "Prefix-to-AS map: invalid line: "..line)
+         assert(asn > 0 and asn < 2^32, "Prefix-to-AS map: asn out of range: "..asn)
+         local pfx, len = proto:pton_cidr(cidr)
+         table.pt:add(pfx, len, asn)
+      end
+   end
+   table.pt:build()
+   return table
+end
+
+local map_info = {
+   mac_to_as = {
+      create_fn = make_mac_to_as_map,
+      logger_module = 'MAC to AS mapper'
+   },
+   vlan_to_ifindex = {
+      create_fn = make_vlan_to_ifindex_map,
+      logger_module = 'VLAN to ifIndex mapper'
+   },
+   pfx4_to_as = {
+      create_fn = function (name) return make_pfx_to_as_map(name, ipv4) end,
+      logger_module = 'IPv4 prefix to AS mapper'
+   },
+   pfx6_to_as = {
+      create_fn = function (name) return make_pfx_to_as_map(name, ipv6) end,
+      logger_module = 'IPv6 prefix to AS mapper'
+   }
+}
+
+local maps = {}
+
+function mk_map(name, file, log_rate, log_fh)
+   local info = assert(map_info[name])
+   local map = maps[name]
+   if not map then
+      map = info.create_fn(file)
+      maps[name] = map
+   end
+   local map = { map = map }
+   if log_fh then
+      map.logger = logger.new({ rate = log_rate or 0.05,
+                                fh = log_fh,
+                                module = info.logger_module })
+   end
+   return map
+end

diff --git a/src/apps/ipfix/strings.lua b/src/apps/ipfix/strings.lua
new file mode 100644
index 0000000000..90d2947a1e
--- /dev/null
+++ b/src/apps/ipfix/strings.lua
@@ -0,0 +1,86 @@
+module(..., package.seeall)
+
+local ffi = require("ffi")
+
+ct_t = ffi.typeof([[
+   struct {
+      uint8_t *text;
+      uint16_t length;
+      uint16_t pos;
+   }
+]])
+
+function ct_set(ct, pos)
+   ct.pos = pos
+end
+
+function ct_get(ct)
+   return ct.pos
+end
+
+function ct_at(ct)
+   return ct.text + ct.pos
+end
+
+function ct_init(ct, text, length, pos)
+   ct.text = text
+   ct.length = length
+   ct.pos = pos
+function ct_init(ct, text, length, pos) + ct.text = text + ct.length = length + ct.pos = pos or 0 +end + +function search(string, ct, tail) + local slen = string.len + local pos = ct.pos + while (pos + slen < ct.length) do + if ffi.C.strncasecmp(string.buf, ct.text + pos, slen) == 0 then + if tail then pos = pos + slen end + ct.pos = pos + return pos + end + pos = pos + 1 + end + return nil +end + +function upto_space_or_cr(ct) + local text = ct.text + local pos = ct.pos + local pos_start = pos + while (pos < ct.length and text[pos] ~= 32 and text[pos] ~= 13) do + pos = pos + 1 + end + ct.pos = pos + return pos, pos - pos_start +end + +function skip_space(ct) + local text = ct.text + local pos = ct.pos + local pos_start = pos + while (pos < ct.length and text[pos] == 32) do + pos = pos + 1 + end + ct.pos = pos + return pos, pos - pos_start +end + +function string_to_buf(s) + -- Using ffi.new("uint8_t[?]", #s) results in trace aborts due to + -- "bad argument type" in ffi.sizeof() + local buf = ffi.new("uint8_t["..#s.."]") + for i = 1, #s do + buf[i-1] = s:byte(i,i) + end + return buf +end + +function strings_to_buf(t) + local result = {} + for k, v in pairs(t) do + result[k] = { + buf = string_to_buf(v), + len = #v + } + end + return result +end diff --git a/src/apps/ipfix/template.lua b/src/apps/ipfix/template.lua index be649ec7d3..22f942ca2d 100644 --- a/src/apps/ipfix/template.lua +++ b/src/apps/ipfix/template.lua @@ -3,41 +3,58 @@ module(..., package.seeall) -local bit = require("bit") -local ffi = require("ffi") -local pf = require("pf") -local consts = require("apps.lwaftr.constants") -local lib = require("core.lib") +local bit = require("bit") +local ffi = require("ffi") +local pf = require("pf") +local consts = require("apps.lwaftr.constants") +local lib = require("core.lib") +local counter = require("core.counter") +local ethernet = require("lib.protocol.ethernet") +local ipv4 = require("lib.protocol.ipv4") +local ipv6 = require("lib.protocol.ipv6") +local metadata = require("apps.rss.metadata") +local strings = require("apps.ipfix.strings") +local dns = require("apps.ipfix.dns") +local S = require("syscall") local ntohs = lib.ntohs local htonl, htons = lib.htonl, lib.htons local function htonq(v) return bit.bswap(v + 0ULL) end +local metadata_get = metadata.get +local ether_header_ptr_t = metadata.ether_header_ptr_t local function ptr_to(ctype) return ffi.typeof('$*', ctype) end local debug = lib.getenv("FLOW_EXPORT_DEBUG") +local IP_PROTO_ICMP = 1 local IP_PROTO_TCP = 6 local IP_PROTO_UDP = 17 +local IP_PROTO_ICMP6 = 58 local IP_PROTO_SCTP = 132 +-- Whether a protocol is a transport protocol is encoded in this +-- table so that the check is a single table lookup instead of a +-- conditional with logical operators, which can lead to multiple +-- levels of side-traces +local transport_proto_p = { + [IP_PROTO_TCP] = true, + [IP_PROTO_UDP] = true, + [IP_PROTO_SCTP] = true +} -- These constants are taken from the lwaftr constants module, which -- is maybe a bad dependency but sharing code is good -- TODO: move constants somewhere else? lib?
-local ethertype_ipv4 = consts.ethertype_ipv4 -local ethertype_ipv6 = consts.ethertype_ipv6 -local ethernet_header_size = consts.ethernet_header_size -local ipv6_fixed_header_size = consts.ipv6_fixed_header_size -local o_ethernet_ethertype = consts.o_ethernet_ethertype -local o_ipv4_total_length = consts.o_ipv4_total_length -local o_ipv4_ver_and_ihl = consts.o_ipv4_ver_and_ihl +local o_ipv4_dscp_and_ecn = consts.o_ipv4_dscp_and_ecn local o_ipv4_proto = consts.o_ipv4_proto local o_ipv4_src_addr = consts.o_ipv4_src_addr local o_ipv4_dst_addr = consts.o_ipv4_dst_addr -local o_ipv6_payload_len = consts.o_ipv6_payload_len -local o_ipv6_next_header = consts.o_ipv6_next_header +local o_icmpv4_msg_type = consts.o_icmpv4_msg_type +local o_icmpv4_msg_code = consts.o_icmpv4_msg_code local o_ipv6_src_addr = consts.o_ipv6_src_addr local o_ipv6_dst_addr = consts.o_ipv6_dst_addr +local o_icmpv6_msg_type = consts.o_icmpv6_msg_type +local o_icmpv6_msg_code = consts.o_icmpv6_msg_code local function string_parser(str) local idx = 1 @@ -46,7 +63,7 @@ local function string_parser(str) function ret.consume_upto(char) local start_idx = idx local byte = char:byte() - while str:byte(idx) ~= byte do + while str:byte(idx) ~= byte and idx <= str:len() do if str:byte(idx) == quote then idx = idx + 1 while str:byte(idx) ~= quote do idx = idx + 1 end @@ -61,38 +78,117 @@ local function string_parser(str) end -- Parse out available IPFIX fields. -local function make_ipfix_element_map() - local elems = require("apps.ipfix.ipfix_information_elements_inc") - local parser = string_parser(elems) +local function make_ipfix_element_map(names) local map = {} - while not parser.is_done() do - local id = parser.consume_upto(",") - local name = parser.consume_upto(",") - local data_type = parser.consume_upto(",") - for i=1,8 do parser.consume_upto(",") end - parser.consume_upto("\n") - map[name] = { id = id, data_type = data_type } + for _, name in ipairs(names) do + local elems = require("apps.ipfix."..name.."_inc") + local parser = string_parser(elems) + while not parser.is_done() do + local id = parser.consume_upto(",") + local name = parser.consume_upto(",") + local data_type = parser.consume_upto(",") + for i=1,8 do parser.consume_upto(",") end + parser.consume_upto("\n") + map[name] = { id = id, data_type = data_type } + local pen, id = id:match("(%d+):(%d+)") + if pen then + -- Private Enterprise Number + map[name].id = tonumber(bit.bor(id, 0x8000)) + map[name].pen = pen + end + end end return map end -local ipfix_elements = make_ipfix_element_map() +local ipfix_elements = + make_ipfix_element_map({ 'ipfix_information_elements', + 'ipfix_information_elements_local' }) local swap_fn_env = { htons = htons, htonl = htonl, htonq = htonq } +local aggregate_info = { + v4 = { + key_type = ffi.typeof([[ + struct { + uint32_t addr; + } __attribute__((packed)) + ]]), + mk_fns = function(plen_v4, plen_v6) + local plen = plen_v4 + local mask = 0 + if plen > 0 then + mask = bit.bnot(bit.lshift(1, 32-plen) - 1) + end + return + function (flow_key, rate_key) + rate_key.addr = htonl( + bit.band( + htonl(ffi.cast("uint32_t *", + flow_key.sourceIPv4Address)[0]), + mask)) + end, + function(rate_key) + return ipv4:ntop(ffi.cast("uint8_t*", rate_key)).."/"..plen + end + end + }, + v6 = { + key_type = ffi.typeof([[ + struct { + uint64_t addr[2]; + } __attribute__((packed)) + ]]), + mk_fns = function(plen_v4, plen_v6) + local plen = plen_v6 + + local function plen2mask(plen) + local mask = 0ULL + if plen > 0 then + mask = bit.bnot(bit.lshift(1ULL, 
64-plen) - 1) + end + return mask + end + + mask_low = plen2mask(plen > 64 and plen - 64 or 0) + mask_high = plen2mask(plen >= 64 and 64 or plen) + return + function (flow_key, rate_key) + local addr = ffi.cast("uint64_t *", flow_key.sourceIPv6Address) + rate_key.addr[0] = htonq( + bit.band( + htonq(ffi.cast("uint64_t *", + flow_key.sourceIPv6Address)[0]), + mask_high)) + rate_key.addr[1] = htonq( + bit.band( + htonq(ffi.cast("uint64_t *", + flow_key.sourceIPv6Address)[1]), + mask_low)) + end, + function(rate_key) + return ipv6:ntop(ffi.cast("uint8_t*", rate_key)).."/"..plen + end + end + } +} + -- Create a table describing the information needed to create -- flow templates and data records. -local function make_template_info(spec) +function make_template_info(spec) -- Representations of IPFIX IEs. local ctypes = { unsigned8 = 'uint8_t', unsigned16 = 'uint16_t', unsigned32 = 'uint32_t', unsigned64 = 'uint64_t', + string = 'uint8_t[?]', octetArray = 'uint8_t[?]', ipv4Address = 'uint8_t[4]', ipv6Address = 'uint8_t[16]', - dateTimeMilliseconds = 'uint64_t' } + macAddress = 'uint8_t[6]', dateTimeMilliseconds = 'uint64_t' } local bswap = { uint16_t='htons', uint32_t='htonl', uint64_t='htonq' } - -- the contents of the template records we will send - -- there is an ID & length for each field - local length = 2 * (#spec.keys + #spec.values) + -- The contents of the template records we will send. There is an + -- ID & length (2 bytes each) for each field as well as possibly a + -- PEN (4 bytes). We pre-allocate a buffer of the maximum possible + -- size. + local length = 4 * (#spec.keys + #spec.values) local buffer = ffi.new("uint16_t[?]", length) -- octets in a data record @@ -100,31 +196,58 @@ local function make_template_info(spec) local swap_fn = {} local function process_fields(buffer, fields, struct_def, types, swap_tmpl) - for idx, name in ipairs(fields) do + local idx = 0 + for _, name in ipairs(fields) do + local _name, size = name:match("(%w+)=(%d+)") + if _name then + name = _name + end local entry = ipfix_elements[name] local ctype = assert(ctypes[entry.data_type], - 'unimplemented: '..entry.data_type) + name..': unimplemented data type ' + ..entry.data_type) + if size then + size = tonumber(size) + assert(entry.data_type == 'string' or entry.data_type == 'octetArray', + name..': length parameter given for fixed-length data type ' + ..entry.data_type) + ctype = ctype:gsub('%?', size) + else + assert(entry.data_type ~= 'string' and entry.data_type ~= 'octetArray', + name..': length parameter required for data type ' + ..entry.data_type) + end data_len = data_len + ffi.sizeof(ctype) - buffer[2 * (idx - 1)] = htons(entry.id) - buffer[2 * (idx - 1) + 1] = htons(ffi.sizeof(ctype)) + buffer[idx] = htons(entry.id) + buffer[idx + 1] = htons(ffi.sizeof(ctype)) + idx = idx + 2 + if entry.pen then + ffi.cast("uint32_t*", buffer + idx)[0] = htonl(entry.pen) + idx = idx + 2 + end table.insert(struct_def, '$ '..name..';') table.insert(types, ffi.typeof(ctype)) if bswap[ctype] then table.insert(swap_fn, swap_tmpl:format(name, bswap[ctype], name)) end end + return idx end table.insert(swap_fn, 'return function(o)') local key_struct_def = { 'struct {' } local key_types = {} - process_fields(buffer, spec.keys, key_struct_def, key_types, - 'o.key.%s = %s(o.key.%s)') + local length = process_fields(buffer, spec.keys, key_struct_def, key_types, + 'o.key.%s = %s(o.key.%s)') table.insert(key_struct_def, '} __attribute__((packed))') local value_struct_def = { 'struct {' } local value_types = {} - 
process_fields(buffer + #spec.keys * 2, spec.values, value_struct_def, - value_types, 'o.value.%s = %s(o.value.%s)') + length = length + process_fields(buffer + length, spec.values, value_struct_def, + value_types, 'o.value.%s = %s(o.value.%s)') + if spec.state_t then + table.insert(value_struct_def, "$ state;") + table.insert(value_types, spec.state_t) + end table.insert(value_struct_def, '} __attribute__((packed))') table.insert(swap_fn, 'end') local key_t = ffi.typeof(table.concat(key_struct_def, ' '), @@ -136,8 +259,16 @@ local function make_template_info(spec) gen_swap_fn = loadstring(table.concat(swap_fn, '\n')) setfenv(gen_swap_fn, swap_fn_env) - assert(ffi.sizeof(record_t) == data_len) + -- State data, if present, is part of the value but must not be + -- included in export records. + assert(ffi.sizeof(record_t) - ffi.sizeof(spec.state_t or 'char [0]') == data_len) + local counters_names = {} + if spec.counters then + for name, _ in pairs(spec.counters) do + table.insert(counters_names, name) + end + end return { id = spec.id, field_count = #spec.keys + #spec.values, buffer = buffer, @@ -148,18 +279,29 @@ local function make_template_info(spec) record_t = record_t, record_ptr_t = ptr_to(record_t), swap_fn = gen_swap_fn(), - match = pf.compile_filter(spec.filter) + match = pf.compile_filter(spec.filter), + counters = spec.counters, + counters_names = counters_names, + extract = spec.extract, + accumulate = spec.accumulate, + require_maps = spec.require_maps or {}, + aggregate_info = aggregate_info[spec.aggregation_type] } end local uint16_ptr_t = ffi.typeof('uint16_t *') -local function get_ipv4_ihl(l3) - return bit.band((l3 + o_ipv4_ver_and_ihl)[0], 0x0f) +local function get_ipv4_tos(l3) return l3[o_ipv4_dscp_and_ecn] end +local function get_ipv6_tc(l3) + -- Version, traffic class and first part of flow label + local v_tc_fl = ntohs(ffi.cast(uint16_ptr_t, l3)[0]) + -- Traffic class is bits 4-11 (MSB to LSB) + return (bit.rshift(bit.band(0x0FF0, v_tc_fl), 4)) end -local function get_ipv4_protocol(l3) return l3[o_ipv4_proto] end -local function get_ipv6_next_header(l3) return l3[o_ipv6_next_header] end +local function get_icmp_typecode(l4) + return ntohs(ffi.cast(uint16_ptr_t, l4+o_icmpv4_msg_type)[0]) +end local function get_ipv4_src_addr_ptr(l3) return l3 + o_ipv4_src_addr end local function get_ipv4_dst_addr_ptr(l3) return l3 + o_ipv4_dst_addr end @@ -180,135 +322,555 @@ local function read_ipv6_dst_address(l3, dst) ffi.copy(dst, get_ipv6_dst_addr_ptr(l3), 16) end -local function get_tcp_src_port(l4) +local function get_transport_src_port(l4) return ntohs(ffi.cast(uint16_ptr_t, l4)[0]) end -local function get_tcp_dst_port(l4) +local function get_transport_dst_port(l4) return ntohs(ffi.cast(uint16_ptr_t, l4)[1]) end -v4 = make_template_info { - id = 256, - filter = "ip", - keys = { "sourceIPv4Address", - "destinationIPv4Address", - "protocolIdentifier", - "sourceTransportPort", - "destinationTransportPort" }, - values = { "flowStartMilliseconds", - "flowEndMilliseconds", - "packetDeltaCount", - "octetDeltaCount"} -} +local function get_tcp_flags(l4) + return ntohs(ffi.cast(uint16_ptr_t, l4)[6]) +end -function v4.extract(pkt, timestamp, entry) - local l2 = pkt.data - local l3 = l2 + ethernet_header_size - local ihl = get_ipv4_ihl(l3) - local l4 = l3 + ihl * 4 +-- Address-family dependent extractors - -- Fill key. - -- FIXME: Try using normal Lua assignment. 
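For orientation, a sketch of what `make_template_info` (above, now exported) derives from a template spec; the template ID and field selection here are illustrative only:

```
local template = require("apps.ipfix.template")

local info = template.make_template_info{
   id = 999, -- hypothetical template ID
   filter = "ip",
   keys   = { "sourceIPv4Address", "destinationIPv4Address" },
   values = { "packetDeltaCount", "octetDeltaCount" }
}
assert(info.field_count == 4)      -- keys plus values
local entry = info.record_t()      -- zero-filled key/value record
entry.value.packetDeltaCount = 1
info.swap_fn(entry)                -- byte-swap fields to network order
```

The returned table also carries `buffer` (the on-the-wire template record), `data_len` (the size of one data record) and `match` (the compiled pf filter used to steer packets to the right template).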
+local function extract_v4_addr(l3, entry) read_ipv4_src_address(l3, entry.key.sourceIPv4Address) read_ipv4_dst_address(l3, entry.key.destinationIPv4Address) - local prot = get_ipv4_protocol(l3) - entry.key.protocolIdentifier = prot - if prot == IP_PROTO_TCP or prot == IP_PROTO_UDP or prot == IP_PROTO_SCTP then - entry.key.sourceTransportPort = get_tcp_src_port(l4) - entry.key.destinationTransportPort = get_tcp_dst_port(l4) - else - entry.key.sourceTransportPort = 0 - entry.key.destinationTransportPort = 0 - end +end + +local function extract_v6_addr(l3, entry) + read_ipv6_src_address(l3, entry.key.sourceIPv6Address) + read_ipv6_dst_address(l3, entry.key.destinationIPv6Address) +end + +-- Address-family independent extract/accumulate functions + +local function extract_transport_key(l4, entry) + entry.key.sourceTransportPort = get_transport_src_port(l4) + entry.key.destinationTransportPort = get_transport_dst_port(l4) +end + +local function extract_tcp_flags(l4, entry) + -- Mask off data offset bits + entry.value.tcpControlBits = bit.band(0xFFF, get_tcp_flags(l4)) +end + +local function accumulate_tcp_flags(dst, new) + dst.value.tcpControlBits = bit.bor(dst.value.tcpControlBits, + new.value.tcpControlBits) +end + +local function extract_tcp_flags_reduced(l4, entry) + entry.value.tcpControlBitsReduced = bit.band(0xFF, get_tcp_flags(l4)) +end + +local function accumulate_tcp_flags_reduced(dst, new) + dst.value.tcpControlBitsReduced = + bit.bor(dst.value.tcpControlBitsReduced, + new.value.tcpControlBitsReduced) +end + +-- Clear key and value, extract the 3-tuple, fill in flow start/end +-- times and packet/octet counters. This is the bare minimum any +-- template will need. +local function extract_3_tuple(pkt, timestamp, entry, md, extract_addr_fn) + ffi.fill(entry.key, ffi.sizeof(entry.key)) + ffi.fill(entry.value, ffi.sizeof(entry.value)) + + extract_addr_fn(md.l3, entry) + entry.key.protocolIdentifier = md.proto - -- Fill value. entry.value.flowStartMilliseconds = timestamp entry.value.flowEndMilliseconds = timestamp entry.value.packetDeltaCount = 1 - -- Measure bytes starting with the IP header. - entry.value.octetDeltaCount = pkt.length - ethernet_header_size + entry.value.octetDeltaCount = md.total_length +end + +local function extract_5_tuple(pkt, timestamp, entry, md, extract_addr_fn) + extract_3_tuple(pkt, timestamp, entry, md, extract_addr_fn) + if transport_proto_p[md.proto] and md.frag_offset == 0 then + extract_transport_key(md.l4, entry) + end end -function v4.accumulate(dst, new) +local function accumulate_generic(dst, new) + -- If dst is a flow entry which has been cleared after an active + -- timeout and this is the first packet received since then, + -- flowStartMilliseconds is the time at which the flow was last + -- exported rather than the time at which the flow actually started + -- in the new active window. 
+ if dst.value.packetDeltaCount == 0 then + dst.value.flowStartMilliseconds = new.value.flowStartMilliseconds + end dst.value.flowEndMilliseconds = new.value.flowEndMilliseconds dst.value.packetDeltaCount = dst.value.packetDeltaCount + 1 dst.value.octetDeltaCount = dst.value.octetDeltaCount + new.value.octetDeltaCount end -function v4.tostring(entry) - local ipv4 = require("lib.protocol.ipv4") - local key = entry.key - local protos = - { [IP_PROTO_TCP]='TCP', [IP_PROTO_UDP]='UDP', [IP_PROTO_SCTP]='SCTP' } - return string.format( - "%s (%d) -> %s (%d) [%s]", - ipv4:ntop(key.sourceIPv4Address), key.sourceTransportPort, - ipv4:ntop(key.destinationIPv4Address), key.destinationTransportPort, - protos[key.protocolIdentifier] or tostring(key.protocolIdentifier)) -end - -v6 = make_template_info { - id = 257, - filter = "ip6", - keys = { "sourceIPv6Address", - "destinationIPv6Address", - "protocolIdentifier", - "sourceTransportPort", - "destinationTransportPort" }, - values = { "flowStartMilliseconds", - "flowEndMilliseconds", - "packetDeltaCount", - "octetDeltaCount" } -} +local function v4_extract (self, pkt, timestamp, entry) + local md = metadata_get(pkt) + extract_5_tuple(pkt, timestamp, entry, md, extract_v4_addr) + if md.proto == IP_PROTO_TCP and md.frag_offset == 0 then + extract_tcp_flags_reduced(md.l4, entry) + end +end -function v6.extract(pkt, timestamp, entry) - local l2 = pkt.data - local l3 = l2 + ethernet_header_size - -- TODO: handle chained headers - local l4 = l3 + ipv6_fixed_header_size +local function v6_extract (self, pkt, timestamp, entry) + local md = metadata_get(pkt) + extract_5_tuple(pkt, timestamp, entry, md, extract_v6_addr) + if md.proto == IP_PROTO_TCP and md.frag_offset == 0 then + extract_tcp_flags_reduced(md.l4, entry) + end +end - -- Fill key. - -- FIXME: Try using normal Lua assignment. - read_ipv6_src_address(l3, entry.key.sourceIPv6Address) - read_ipv6_dst_address(l3, entry.key.destinationIPv6Address) - local prot = get_ipv6_next_header(l3) - entry.key.protocolIdentifier = prot - if prot == IP_PROTO_TCP or prot == IP_PROTO_UDP or prot == IP_PROTO_SCTP then - entry.key.sourceTransportPort = get_tcp_src_port(l4) - entry.key.destinationTransportPort = get_tcp_dst_port(l4) - else - entry.key.sourceTransportPort = 0 - entry.key.destinationTransportPort = 0 +--- Helper functions for HTTP templates + +-- We want to be able to find a "Host:" header even if it is not in +-- the same TCP segment as the GET request, which requires keeping +-- state. +local HTTP_state_t = ffi.typeof([[ + struct { + uint8_t have_GET; + uint8_t have_host; + uint8_t examined; + } __attribute__((packed)) +]]) +-- The number of TCP segments to scan for the first GET request +-- (including the SYN segment, which is skipped). Most requests are +-- found in the first non-handshake packet (segment #3 from the +-- client). Empirical evidence shows a strong peak there with a long +-- tail. A cutoff of 10 is expected to find at least 80% of the GET +-- requests.
+local HTTP_scan_threshold = 10 +-- HTTP-specific statistics counters +local function HTTP_counters() + return { + HTTP_flows_examined = 0, + HTTP_GET_matches = 0, + HTTP_host_matches = 0 + } +end + +local HTTP_strings = strings.strings_to_buf({ + GET = 'GET ', + Host = 'Host:' +}) + +local HTTP_ct = strings.ct_t() + +local function HTTP_accumulate(self, dst, new, pkt) + local md = metadata_get(pkt) + if ((dst.value.packetDeltaCount >= HTTP_scan_threshold or + -- TCP SYN + bit.band(new.value.tcpControlBitsReduced, 0x02) == 0x02)) then + return + end + local state = dst.value.state + if state.examined == 0 then + self.counters.HTTP_flows_examined = + self.counters.HTTP_flows_examined + 1 + state.examined = 1 + end + strings.ct_init(HTTP_ct, pkt.data, pkt.length, md.l4 - pkt.data) + if (state.have_GET == 0 and + strings.search(HTTP_strings.GET, HTTP_ct, true)) then + ffi.copy(dst.value.httpRequestMethod, 'GET') + state.have_GET = 1 + strings.skip_space(HTTP_ct) + local start = strings.ct_at(HTTP_ct) + local _, length = strings.upto_space_or_cr(HTTP_ct) + length = math.min(length, ffi.sizeof(dst.value.httpRequestTarget) - 1) + ffi.copy(dst.value.httpRequestTarget, start, length) + self.counters.HTTP_GET_matches = self.counters.HTTP_GET_matches + 1 + end + if (state.have_GET == 1 and state.have_host == 0 and + strings.search(HTTP_strings.Host, HTTP_ct, true)) then + state.have_host = 1 + strings.skip_space(HTTP_ct) + local start = strings.ct_at(HTTP_ct) + local _, length = strings.upto_space_or_cr(HTTP_ct) + length = math.min(length, ffi.sizeof(dst.value.httpRequestHost) - 1) + ffi.copy(dst.value.httpRequestHost, start, length) + self.counters.HTTP_host_matches = self.counters.HTTP_host_matches + 1 end +end - -- Fill value. - entry.value.flowStartMilliseconds = timestamp - entry.value.flowEndMilliseconds = timestamp - entry.value.packetDeltaCount = 1 - -- Measure bytes starting with the IP header. 
- entry.value.octetDeltaCount = pkt.length - ethernet_header_size +local function DNS_extract(self, pkt, timestamp, entry, extract_addr_fn) + local md = metadata_get(pkt) + extract_5_tuple(pkt, timestamp, entry, md, extract_addr_fn) + if md.length_delta == 0 and md.frag_offset == 0 then + local dns_hdr = md.l4 + 8 + local msg_size = pkt.data + pkt.length - dns_hdr + dns.extract(dns_hdr, msg_size, entry) + end end -function v6.accumulate(dst, new) - dst.value.flowEndMilliseconds = new.value.flowEndMilliseconds - dst.value.packetDeltaCount = dst.value.packetDeltaCount + 1 - dst.value.octetDeltaCount = - dst.value.octetDeltaCount + new.value.octetDeltaCount +local function DNS_accumulate(self, dst, new) + accumulate_generic(dst, new) +end + +local function can_log(logger) + return logger and logger:can_log() +end + +local function extended_extract(self, pkt, md, timestamp, entry, extract_addr_fn) + extract_5_tuple(pkt, timestamp, entry, md, extract_addr_fn) + local eth_hdr = ffi.cast(ether_header_ptr_t, pkt.data) + + ffi.copy(entry.value.sourceMacAddress, eth_hdr.shost, 6) + ffi.copy(entry.value.postDestinationMacAddress, eth_hdr.dhost, 6) + local mac_to_as = self.maps.mac_to_as + local result = mac_to_as.map:lookup_ptr(eth_hdr.shost) + if result then + entry.value.bgpPrevAdjacentAsNumber = result.value + elseif can_log(mac_to_as.logger) then + mac_to_as.logger:log("unknown source MAC " + ..ethernet:ntop(eth_hdr.shost)) + end + if not ethernet:is_mcast(eth_hdr.dhost) then + local result = mac_to_as.map:lookup_ptr(eth_hdr.dhost) + if result then + entry.value.bgpNextAdjacentAsNumber = result.value + elseif can_log(mac_to_as.logger) then + mac_to_as.logger:log("unknown destination MAC " + ..ethernet:ntop(eth_hdr.dhost)) + end + end + + local vlan = md.vlan + entry.value.vlanId = vlan + if vlan ~= 0 then + local vlan_to_ifindex = self.maps.vlan_to_ifindex + local result = vlan_to_ifindex.map[vlan] + if result then + entry.value.ingressInterface = result.ingress + entry.value.egressInterface = result.egress + elseif can_log(vlan_to_ifindex.logger) then + vlan_to_ifindex.logger:log("unknown vlan "..vlan) + end + end + + if md.proto == IP_PROTO_TCP and md.frag_offset == 0 then + extract_tcp_flags_reduced(md.l4, entry) + end +end + +local asn = ffi.new([[ + union { + char array[4]; + uint32_t number; + } +]]) +local function v4_extended_extract (self, pkt, timestamp, entry) + local md = metadata_get(pkt) + extended_extract(self, pkt, md, timestamp, entry, extract_v4_addr) + + local pfx_to_as = self.maps.pfx4_to_as + local asn = pfx_to_as.map:search_bytes(entry.key.sourceIPv4Address) + if asn then + entry.value.bgpSourceAsNumber = asn + elseif can_log(pfx_to_as.logger) then + pfx_to_as.logger:log("missing AS for source " + ..ipv4:ntop(entry.key.sourceIPv4Address)) + end + local asn = pfx_to_as.map:search_bytes(entry.key.destinationIPv4Address) + if asn then + entry.value.bgpDestinationAsNumber = asn + elseif can_log(pfx_to_as.logger) then + pfx_to_as.logger:log("missing AS for destination " + ..ipv4:ntop(entry.key.destinationIPv4Address)) + end + + entry.value.ipClassOfService = get_ipv4_tos(md.l3) + if md.proto == IP_PROTO_ICMP and md.frag_offset == 0 then + entry.value.icmpTypeCodeIPv4 = get_icmp_typecode(md.l4) + end +end + +local function v4_extended_accumulate (self, dst, new) + accumulate_generic(dst, new) + if dst.key.protocolIdentifier == IP_PROTO_TCP then + accumulate_tcp_flags_reduced(dst, new) + end +end + +local function v6_extended_extract (self, pkt, timestamp, entry) + local md = 
metadata_get(pkt) + extended_extract(self, pkt, md, timestamp, entry, extract_v6_addr) + + local pfx_to_as = self.maps.pfx6_to_as + local asn = pfx_to_as.map:search_bytes(entry.key.sourceIPv6Address) + if asn then + entry.value.bgpSourceAsNumber = asn + elseif can_log(pfx_to_as.logger) then + pfx_to_as.logger:log("missing AS for source " + ..ipv6:ntop(entry.key.sourceIPv6Address)) + end + local asn = pfx_to_as.map:search_bytes(entry.key.destinationIPv6Address) + if asn then + entry.value.bgpDestinationAsNumber = asn + elseif can_log(pfx_to_as.logger) then + pfx_to_as.logger:log("missing AS for destination " + ..ipv6:ntop(entry.key.destinationIPv6Address)) + end + + entry.value.ipClassOfService = get_ipv6_tc(md.l3) + if md.proto == IP_PROTO_ICMP6 and md.frag_offset == 0 then + entry.value.icmpTypeCodeIPv6 = get_icmp_typecode(md.l4) + end end -function v6.tostring(entry) - local ipv6 = require("lib.protocol.ipv6") - local key = entry.key - local protos = - { [IP_PROTO_TCP]='TCP', [IP_PROTO_UDP]='UDP', [IP_PROTO_SCTP]='SCTP' } - return string.format( - "%s (%d) -> %s (%d) [%s]", - ipv6:ntop(key.sourceIPv6Address), key.sourceTransportPort, - ipv6:ntop(key.destinationIPv6Address), key.destinationTransportPort, - protos[key.protocolIdentifier] or tostring(key.protocolIdentifier)) +local function v6_extended_accumulate (self, dst, new) + accumulate_generic(dst, new) + if dst.key.protocolIdentifier == IP_PROTO_TCP then + accumulate_tcp_flags_reduced(dst, new) + end end +templates = { + v4 = { + id = 256, + filter = "ip", + aggregation_type = 'v4', + keys = { "sourceIPv4Address", + "destinationIPv4Address", + "protocolIdentifier", + "sourceTransportPort", + "destinationTransportPort" }, + values = { "flowStartMilliseconds", + "flowEndMilliseconds", + "packetDeltaCount", + "octetDeltaCount", + "tcpControlBitsReduced" }, + extract = v4_extract, + accumulate = function (self, dst, new) + accumulate_generic(dst, new) + if dst.key.protocolIdentifier == IP_PROTO_TCP then + accumulate_tcp_flags_reduced(dst, new) + end + end, + tostring = function (entry) + local ipv4 = require("lib.protocol.ipv4") + local key = entry.key + local protos = + { [IP_PROTO_TCP]='TCP', [IP_PROTO_UDP]='UDP', [IP_PROTO_SCTP]='SCTP' } + return string.format( + "%s (%d) -> %s (%d) [%s]", + ipv4:ntop(key.sourceIPv4Address), key.sourceTransportPort, + ipv4:ntop(key.destinationIPv4Address), key.destinationTransportPort, + protos[key.protocolIdentifier] or tostring(key.protocolIdentifier)) + end + }, + v4_HTTP = { + id = 257, + filter = "ip and tcp dst port 80", + aggregation_type = 'v4', + keys = { "sourceIPv4Address", + "destinationIPv4Address", + "protocolIdentifier", + "sourceTransportPort", + "destinationTransportPort" }, + values = { "flowStartMilliseconds", + "flowEndMilliseconds", + "packetDeltaCount", + "octetDeltaCount", + "tcpControlBitsReduced", + "httpRequestMethod=8", + "httpRequestHost=32", + "httpRequestTarget=64" }, + state_t = HTTP_state_t, + counters = HTTP_counters(), + extract = v4_extract, + accumulate = function (self, dst, new, pkt) + accumulate_generic(dst, new) + accumulate_tcp_flags_reduced(dst, new) + HTTP_accumulate(self, dst, new, pkt) + end + }, + v4_DNS = { + id = 258, + filter = "ip and udp port 53", + aggregation_type = 'v4', + keys = { "sourceIPv4Address", + "destinationIPv4Address", + "protocolIdentifier", + "sourceTransportPort", + "destinationTransportPort", + "dnsFlagsCodes", + "dnsQuestionCount", + "dnsAnswerCount", + "dnsQuestionName=64", + "dnsQuestionType", + "dnsQuestionClass", + 
"dnsAnswerName=64", + "dnsAnswerType", + "dnsAnswerClass", + "dnsAnswerTtl", + "dnsAnswerRdata=64", + "dnsAnswerRdataLen" }, + values = { "flowStartMilliseconds", + "flowEndMilliseconds", + "packetDeltaCount", + "octetDeltaCount" }, + extract = function (self, pkt, timestamp, entry) + DNS_extract(self, pkt, timestamp, entry, extract_v4_addr) + end, + accumulate = DNS_accumulate + }, + v4_extended = { + id = 1256, + filter = "ip", + aggregation_type = 'v4', + keys = { "sourceIPv4Address", + "destinationIPv4Address", + "protocolIdentifier", + "sourceTransportPort", + "destinationTransportPort" }, + values = { "flowStartMilliseconds", + "flowEndMilliseconds", + "packetDeltaCount", + "octetDeltaCount", + "sourceMacAddress", + -- This is destinationMacAddress per NetFlowV9 + "postDestinationMacAddress", + "vlanId", + "ipClassOfService", + "bgpSourceAsNumber", + "bgpDestinationAsNumber", + "bgpPrevAdjacentAsNumber", + "bgpNextAdjacentAsNumber", + "tcpControlBitsReduced", + "icmpTypeCodeIPv4", + "ingressInterface", + "egressInterface" }, + require_maps = { 'mac_to_as', 'vlan_to_ifindex', 'pfx4_to_as' }, + extract = v4_extended_extract, + accumulate = v4_extended_accumulate + }, + v6 = { + id = 512, + filter = "ip6", + aggregation_type = 'v6', + keys = { "sourceIPv6Address", + "destinationIPv6Address", + "protocolIdentifier", + "sourceTransportPort", + "destinationTransportPort" }, + values = { "flowStartMilliseconds", + "flowEndMilliseconds", + "packetDeltaCount", + "octetDeltaCount", + "tcpControlBitsReduced" }, + extract = v6_extract, + accumulate = function (self, dst, new) + accumulate_generic(dst, new) + if dst.key.protocolIdentifier == IP_PROTO_TCP then + accumulate_tcp_flags_reduced(dst, new) + end + end, + tostring = function (entry) + local ipv6 = require("lib.protocol.ipv6") + local key = entry.key + local protos = + { [IP_PROTO_TCP]='TCP', [IP_PROTO_UDP]='UDP', [IP_PROTO_SCTP]='SCTP' } + return string.format( + "%s (%d) -> %s (%d) [%s]", + ipv6:ntop(key.sourceIPv6Address), key.sourceTransportPort, + ipv6:ntop(key.destinationIPv6Address), key.destinationTransportPort, + protos[key.protocolIdentifier] or tostring(key.protocolIdentifier)) + end + }, + v6_HTTP = { + id = 513, + filter = "ip6 and tcp dst port 80", + aggregation_type = 'v6', + keys = { "sourceIPv6Address", + "destinationIPv6Address", + "protocolIdentifier", + "sourceTransportPort", + "destinationTransportPort" }, + values = { "flowStartMilliseconds", + "flowEndMilliseconds", + "packetDeltaCount", + "octetDeltaCount", + "tcpControlBitsReduced", + "httpRequestMethod=8", + "httpRequestHost=32", + "httpRequestTarget=64" }, + state_t = HTTP_state_t, + counters = HTTP_counters(), + extract = v6_extract, + accumulate = function (self, dst, new, pkt) + accumulate_generic(dst, new) + accumulate_tcp_flags_reduced(dst, new) + HTTP_accumulate(self, dst, new, pkt) + end + }, + v6_DNS = { + id = 514, + filter = "ip6 and udp port 53", + aggregation_type = 'v6', + keys = { "sourceIPv6Address", + "destinationIPv6Address", + "protocolIdentifier", + "sourceTransportPort", + "destinationTransportPort", + "dnsFlagsCodes", + "dnsQuestionCount", + "dnsAnswerCount", + "dnsQuestionName=64", + "dnsQuestionType", + "dnsQuestionClass", + "dnsAnswerName=64", + "dnsAnswerType", + "dnsAnswerClass", + "dnsAnswerTtl", + "dnsAnswerRdata=64", + "dnsAnswerRdataLen" }, + values = { "flowStartMilliseconds", + "flowEndMilliseconds", + "packetDeltaCount", + "octetDeltaCount" }, + extract = function (self, pkt, timestamp, entry) + DNS_extract(self, pkt, timestamp, 
entry, extract_v6_addr) + end, + accumulate = DNS_accumulate + }, + v6_extended = { + id = 1512, + filter = "ip6", + aggregation_type = 'v6', + keys = { "sourceIPv6Address", + "destinationIPv6Address", + "protocolIdentifier", + "sourceTransportPort", + "destinationTransportPort" }, + values = { "flowStartMilliseconds", + "flowEndMilliseconds", + "packetDeltaCount", + "octetDeltaCount", + "sourceMacAddress", + -- This is destinationMacAddress per NetFlowV9 + "postDestinationMacAddress", + "vlanId", + "ipClassOfService", + "bgpSourceAsNumber", + "bgpDestinationAsNumber", + "bgpNextAdjacentAsNumber", + "bgpPrevAdjacentAsNumber", + "tcpControlBitsReduced", + "icmpTypeCodeIPv6", + "ingressInterface", + "egressInterface" }, + require_maps = { 'mac_to_as', 'vlan_to_ifindex', 'pfx6_to_as' }, + extract = v6_extended_extract, + accumulate = v6_extended_accumulate, + }, +} + function selftest() print('selftest: apps.ipfix.template') local datagram = require("lib.protocol.datagram") @@ -320,7 +882,8 @@ function selftest() local function test(src_ip, dst_ip, src_port, dst_port) local is_ipv6 = not not src_ip:match(':') - local proto = is_ipv6 and ethertype_ipv6 or ethertype_ipv4 + local proto = is_ipv6 and consts.ethertype_ipv6 or + consts.ethertype_ipv4 local eth = ether:new({ src = ether:pton("00:11:22:33:44:55"), dst = ether:pton("55:44:33:22:11:00"), type = proto }) @@ -329,9 +892,11 @@ function selftest() if is_ipv6 then ip = ipv6:new({ src = ipv6:pton(src_ip), dst = ipv6:pton(dst_ip), next_header = IP_PROTO_UDP, ttl = 64 }) + ip:payload_length(udp:sizeof()) else ip = ipv4:new({ src = ipv4:pton(src_ip), dst = ipv4:pton(dst_ip), protocol = IP_PROTO_UDP, ttl = 64 }) + ip:total_length(ip:total_length() + udp:sizeof()) end local udp = udp:new({ src_port = src_port, dst_port = dst_port }) local dg = datagram:new() @@ -341,13 +906,16 @@ function selftest() dg:push(eth) local pkt = dg:packet() + metadata.add(pkt) + local v4 = make_template_info(templates.v4) + local v6 = make_template_info(templates.v6) assert(v4.match(pkt.data, pkt.length) == not is_ipv6) assert(v6.match(pkt.data, pkt.length) == is_ipv6) local templ = is_ipv6 and v6 or v4 local entry = templ.record_t() local timestamp = 13 - templ.extract(pkt, 13, entry) + templ.extract(templ, pkt, 13, entry) if is_ipv6 then assert(ip:src_eq(entry.key.sourceIPv6Address)) assert(ip:dst_eq(entry.key.destinationIPv6Address)) @@ -361,7 +929,7 @@ function selftest() assert(entry.value.flowStartMilliseconds == timestamp) assert(entry.value.flowEndMilliseconds == timestamp) assert(entry.value.packetDeltaCount == 1) - assert(entry.value.octetDeltaCount == pkt.length - ethernet_header_size) + assert(entry.value.octetDeltaCount == pkt.length - consts.ethernet_header_size) packet.free(pkt) end diff --git a/src/apps/ipfix/test/mac_to_as b/src/apps/ipfix/test/mac_to_as new file mode 100644 index 0000000000..54c496a43e --- /dev/null +++ b/src/apps/ipfix/test/mac_to_as @@ -0,0 +1,2 @@ +321-00:11:22:33:44:55 +654-50:44:33:22:11:00 \ No newline at end of file diff --git a/src/apps/ipfix/test/pfx4_to_as.csv b/src/apps/ipfix/test/pfx4_to_as.csv new file mode 100644 index 0000000000..63a3adf23e --- /dev/null +++ b/src/apps/ipfix/test/pfx4_to_as.csv @@ -0,0 +1,3 @@ +network,autonomous_system_number,autonomous_system_organization +192.168.1.0/24,1234,TEST1 +192.168.1.25/32,5678,TEST2 \ No newline at end of file diff --git a/src/apps/ipfix/test/pfx6_to_as.csv b/src/apps/ipfix/test/pfx6_to_as.csv new file mode 100644 index 0000000000..e4cf3981e2 --- /dev/null +++ 
b/src/apps/ipfix/test/pfx6_to_as.csv @@ -0,0 +1,3 @@ +network,autonomous_system_number,autonomous_system_organization +2001:4860::0/32,1234,TEST1 +2001:db8::0/32,5678,TEST2 \ No newline at end of file diff --git a/src/apps/ipfix/test/vlan_to_ifindex b/src/apps/ipfix/test/vlan_to_ifindex new file mode 100644 index 0000000000..9226e5eb39 --- /dev/null +++ b/src/apps/ipfix/test/vlan_to_ifindex @@ -0,0 +1,2 @@ +1-2-3 +4-5-6 \ No newline at end of file diff --git a/src/apps/ipv4/fragment.lua b/src/apps/ipv4/fragment.lua index 4e07271bac..ca2a7a5002 100644 --- a/src/apps/ipv4/fragment.lua +++ b/src/apps/ipv4/fragment.lua @@ -165,8 +165,6 @@ function Fragmenter:push () local input, output = self.input.input, self.output.output local max_length = self.mtu + ether_header_len - self.outgoing_ipv4_fragments_alarm:check() - for _ = 1, link.nreadable(input) do local pkt = link.receive(input) local h = ffi.cast(ether_ipv4_header_ptr_t, pkt.data) @@ -191,6 +189,10 @@ function Fragmenter:push () end end +function Fragmenter:tick () + self.outgoing_ipv4_fragments_alarm:check() +end + function selftest() print("selftest: apps.ipv4.fragment") diff --git a/src/apps/ipv4/reassemble.lua b/src/apps/ipv4/reassemble.lua index fa01f7b3be..7e48ebac59 100644 --- a/src/apps/ipv4/reassemble.lua +++ b/src/apps/ipv4/reassemble.lua @@ -296,8 +296,6 @@ end function Reassembler:push () local input, output = self.input.input, self.output.output - self.incoming_ipv4_fragments_alarm:check() - for _ = 1, link.nreadable(input) do local pkt = link.receive(input) local h = ffi.cast(ether_ipv4_header_ptr_t, pkt.data) @@ -322,6 +320,10 @@ function Reassembler:push () packet.free(pkt) end end +end + +function Reassembler:tick () + self.incoming_ipv4_fragments_alarm:check() if self.next_counter_update < engine.now() then -- Update counters every second, but add a bit of jitter to smooth diff --git a/src/apps/ipv6/fragment.lua b/src/apps/ipv6/fragment.lua index eedca13cf2..0dbc4b0e3c 100644 --- a/src/apps/ipv6/fragment.lua +++ b/src/apps/ipv6/fragment.lua @@ -286,8 +286,6 @@ function Fragmenter:push () local input, output = self.input.input, self.output.output local south, north = self.input.south, self.output.north - self.outgoing_ipv6_fragments_alarm:check() - for _ = 1, link.nreadable(input) do local pkt = link.receive(input) local h = ffi.cast(ether_ipv6_header_ptr_t, pkt.data) @@ -353,10 +351,14 @@ function Fragmenter:push () link.transmit(north, pkt) end end + end +end - if self.pmtu_timer() then - self:expire_pmtu() - end +function Fragmenter:tick () + self.outgoing_ipv6_fragments_alarm:check() + + if self.pmtud and self.pmtu_timer() then + self:expire_pmtu() end end diff --git a/src/apps/ipv6/reassemble.lua b/src/apps/ipv6/reassemble.lua index e2c418fe7f..d7004a54f2 100644 --- a/src/apps/ipv6/reassemble.lua +++ b/src/apps/ipv6/reassemble.lua @@ -343,15 +343,6 @@ end function Reassembler:push () local input, output = self.input.input, self.output.output - self.incoming_ipv6_fragments_alarm:check() - - do - local now = self.tsc:stamp() - if now - self.scan_tstamp > self.scan_interval then - self:expire(now) - end - end - for _ = 1, link.nreadable(input) do local pkt = link.receive(input) local h = ffi.cast(ether_ipv6_header_ptr_t, pkt.data) @@ -380,6 +371,17 @@ function Reassembler:push () self:handle_fragment(pkt) packet.free(pkt) end +end + +function Reassembler:tick () + self.incoming_ipv6_fragments_alarm:check() + + do + local now = self.tsc:stamp() + if now - self.scan_tstamp > self.scan_interval then + self:expire(now) + 
end + end if self.next_counter_update < engine.now() then -- Update counters every second, but add a bit of jitter to smooth diff --git a/src/apps/lwaftr/V4V6.lua b/src/apps/lwaftr/V4V6.lua index 1bf52b5233..14b5f739e5 100644 --- a/src/apps/lwaftr/V4V6.lua +++ b/src/apps/lwaftr/V4V6.lua @@ -193,17 +193,19 @@ local function test_join () config.app(c, 'v4v6', V4V6) config.app(c, 'sink', basic_apps.Sink) - config.link(c, 'source.output -> v4v6.v4') - config.link(c, 'source.output -> v4v6.v6') + config.link(c, 'source.v4 -> v4v6.v4') + config.link(c, 'source.v6 -> v4v6.v6') config.link(c, 'v4v6.output -> sink.input') engine.configure(c) - link.transmit(engine.app_table.source.output.output, arp_pkt()) - link.transmit(engine.app_table.source.output.output, ipv4_pkt()) - link.transmit(engine.app_table.source.output.output, ipv6_pkt()) + for _, output in ipairs{'v4', 'v6'} do + link.transmit(engine.app_table.source.output[output], arp_pkt()) + link.transmit(engine.app_table.source.output[output], ipv4_pkt()) + link.transmit(engine.app_table.source.output[output], ipv6_pkt()) + end engine.main({duration = 0.1, noreport = true}) - assert(link.stats(engine.app_table.sink.input.input).rxpackets == 3) + assert(link.stats(engine.app_table.sink.input.input).rxpackets == 3*2) end function selftest () diff --git a/src/apps/mellanox/benchmark-macs-vlans.sh b/src/apps/mellanox/benchmark-macs-vlans.sh new file mode 100755 index 0000000000..02a5658fd3 --- /dev/null +++ b/src/apps/mellanox/benchmark-macs-vlans.sh @@ -0,0 +1,18 @@ +#!/usr/bin/env bash + +echo i,macs,vlans,txrate,txdrop,txerror,rxrate,rxdrop,rxerror +for i in `seq 1 3`; do + for macs in `seq 4 4 24`; do + for vlans in `seq 0 4 24`; do + out=$(./snabb snsh apps/mellanox/benchmark.snabb -a 81:00.0 -b 81:00.1 -A 6-11 -B 12-17 \ + -m source-fwd -w 6 -q 4 -e $macs -v $vlans) + txrate=$(echo "$out" | grep "Tx Rate is" | cut -d " " -f 4) + txdrop=$(echo "$out" | grep "Tx Drop Rate is" | cut -d " " -f 5) + txerror=$(echo "$out" | grep "Tx Error Rate is" | cut -d " " -f 5) + rxrate=$(echo "$out" | grep "Rx Rate is" | cut -d " " -f 4) + rxdrop=$(echo "$out" | grep "Rx Drop Rate is" | cut -d " " -f 5) + rxerror=$(echo "$out" | grep "Rx Error Rate is" | cut -d " " -f 5) + echo "$i,$macs,$vlans,$txrate,$txdrop,$txerror,$rxrate,$rxdrop,$rxerror" + done + done +done \ No newline at end of file diff --git a/src/apps/mellanox/benchmark-tx-fwd-queues-sizes.sh b/src/apps/mellanox/benchmark-tx-fwd-queues-sizes.sh new file mode 100755 index 0000000000..1d94ed92f6 --- /dev/null +++ b/src/apps/mellanox/benchmark-tx-fwd-queues-sizes.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +sizes_fine="64 +66 +70 +74 +78 +84 +90 +96 +104 +114 +126 +140 +156 +174 +196 +222 +254 +292 +338 +392 +456 +534 +628 +740 +874 +1034 +1226 +1456" + +sizes_coarse="64 +128 +256 +512 +1024" + +echo i,workers,queues,pktsize,txrate,txdrop,txerror,rxrate,rxdrop,rxerror,fwrate,fwdrop,fwerror +for i in `seq 1 3`; do + for w in `seq 1 6`; do + for q in `seq 1 4`; do + for s in $sizes_coarse; do + out=$(./snabb snsh apps/mellanox/benchmark.snabb -a 81:00.0 -b 81:00.1 -A 6-11 -B 12-17 \ + -m source-fwd -w $w -q $q -s $s -n 100e6) + txrate=$(echo "$out" | grep "Tx Rate is" | cut -d " " -f 4) + txdrop=$(echo "$out" | grep "Tx Drop Rate is" | cut -d " " -f 5) + txerror=$(echo "$out" | grep "Tx Error Rate is" | cut -d " " -f 5) + rxrate=$(echo "$out" | grep "Rx Rate is" | cut -d " " -f 4) + rxdrop=$(echo "$out" | grep "Rx Drop Rate is" | cut -d " " -f 5) + rxerror=$(echo "$out" | grep "Rx Error Rate is" | cut 
-d " " -f 5) + fwrate=$(echo "$out" | grep "Fw Rate is" | cut -d " " -f 4) + fwdrop=$(echo "$out" | grep "Fw Drop Rate is" | cut -d " " -f 5) + fwerror=$(echo "$out" | grep "Fw Error Rate is" | cut -d " " -f 5) + echo "$i,$w,$q,$s,$txrate,$txdrop,$txerror,$rxrate,$rxdrop,$rxerror,$fwrate,$fwdrop,$fwerror" + #echo $out + done + done + done +done \ No newline at end of file diff --git a/src/apps/mellanox/benchmark-tx-only-numa.sh b/src/apps/mellanox/benchmark-tx-only-numa.sh new file mode 100755 index 0000000000..981f41e16b --- /dev/null +++ b/src/apps/mellanox/benchmark-tx-only-numa.sh @@ -0,0 +1,13 @@ +#!/usr/bin/env bash + +echo i,core_A,core_B,score,unit +for i in `seq 5`; do + for A in `seq 12 23`; do + for B in `seq 12 23`; do + out=$(./snabb snsh apps/mellanox/benchmark.snabb -a 81:00.0 -b 81:00.1 -A $A -B $B) + score=$(echo "$out" | grep "Tx Rate" | cut -d " " -f 4) + unit=$(echo "$out" | grep "Tx Rate" | cut -d " " -f 5) + echo "$i,$A,$B,$score,$unit" + done + done +done \ No newline at end of file diff --git a/src/apps/mellanox/benchmark-tx-only-queues-sizes.sh b/src/apps/mellanox/benchmark-tx-only-queues-sizes.sh new file mode 100755 index 0000000000..f655af58a7 --- /dev/null +++ b/src/apps/mellanox/benchmark-tx-only-queues-sizes.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +sizes_fine="64 +66 +70 +74 +78 +84 +90 +96 +104 +114 +126 +140 +156 +174 +196 +222 +254 +292 +338 +392 +456 +534 +628 +740 +874 +1034 +1226 +1456" + +sizes_coarse="64 +128 +256 +512 +1024" + +echo i,workers,queues,pktsize,rate,drop,error +for i in `seq 1 3`; do + for w in `seq 1 5`; do + for q in `seq 1 4`; do + for s in $sizes_coarse; do + out=$(./snabb snsh apps/mellanox/benchmark.snabb -a b3:00.0 -b b3:00.1 -A 2-6 -B 7-11 \ + -m source -w $w -q $q -s $s -n 100e6) + rate=$(echo "$out" | grep "Tx Rate is" | cut -d " " -f 4) + drop=$(echo "$out" | grep "Tx Drop Rate is" | cut -d " " -f 5) + error=$(echo "$out" | grep "Tx Error Rate is" | cut -d " " -f 5) + echo "$i,$w,$q,$s,$rate,$drop,$error" + #echo $out + done + done + done +done \ No newline at end of file diff --git a/src/apps/mellanox/benchmark-tx-rx-queues-sizes.sh b/src/apps/mellanox/benchmark-tx-rx-queues-sizes.sh new file mode 100755 index 0000000000..65334776fb --- /dev/null +++ b/src/apps/mellanox/benchmark-tx-rx-queues-sizes.sh @@ -0,0 +1,56 @@ +#!/usr/bin/env bash + +sizes_fine="64 +66 +70 +74 +78 +84 +90 +96 +104 +114 +126 +140 +156 +174 +196 +222 +254 +292 +338 +392 +456 +534 +628 +740 +874 +1034 +1226 +1456" + +sizes_coarse="64 +128 +256 +512 +1024" + +echo i,workers,queues,pktsize,txrate,txdrop,txerror,rxrate,rxdrop,rxerror +for i in `seq 1 3`; do + for w in `seq 1 5`; do + for q in `seq 1 4`; do + for s in $sizes_coarse; do + out=$(./snabb snsh apps/mellanox/benchmark.snabb -a b3:00.0 -b b3:00.1 -A 2-6 -B 7-11 \ + -m source-sink -w $w -q $q -s $s -n 100e6) + txrate=$(echo "$out" | grep "Tx Rate is" | cut -d " " -f 4) + txdrop=$(echo "$out" | grep "Tx Drop Rate is" | cut -d " " -f 5) + txerror=$(echo "$out" | grep "Tx Error Rate is" | cut -d " " -f 5) + rxrate=$(echo "$out" | grep "Rx Rate is" | cut -d " " -f 4) + rxdrop=$(echo "$out" | grep "Rx Drop Rate is" | cut -d " " -f 5) + rxerror=$(echo "$out" | grep "Rx Error Rate is" | cut -d " " -f 5) + echo "$i,$w,$q,$s,$txrate,$txdrop,$txerror,$rxrate,$rxdrop,$rxerror" + #echo $out + done + done + done +done \ No newline at end of file diff --git a/src/apps/mellanox/benchmark.lua b/src/apps/mellanox/benchmark.lua new file mode 100644 index 0000000000..7e5d286459 --- /dev/null +++ 
b/src/apps/mellanox/benchmark.lua @@ -0,0 +1,433 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. +module(..., package.seeall) + +local connectx = require("apps.mellanox.connectx") +local worker = require("core.worker") +local basic_apps = require("apps.basic.basic_apps") +local lib = require("core.lib") +local numa = require("lib.numa") +local ffi = require("ffi") +local band = bit.band +local counter = require("core.counter") + + +function sink (pci, cores, nworkers, nqueues, macs, vlans, opt, npackets) + local cores = cpu_set(cores) + local macs = make_set(macs) + local vlans = make_set(vlans) + + local cfg = mlxconf(pci, nworkers*nqueues, macs, vlans, opt) + + local c = config.new() + config.app(c, "ConnectX", connectx.ConnectX, cfg) + engine.configure(c) + + for w=1, nworkers do + worker.start( + "sink"..w, + ('require("apps.mellanox.benchmark").sink_worker(%q, %s, %d, %d)') + :format(pci, take(cores), nqueues, 1+(w-1)*nqueues) + ) + end + + local stats = engine.app_table.ConnectX.stats + + local startline = npackets/10 + engine.main{no_report=true, done=function () -- warmup + return counter.read(stats.rxpackets) >= startline + end} + + local rxpackets_start = counter.read(stats.rxpackets) + local rxdrop_start = counter.read(stats.rxdrop) + local rxerrors_start = counter.read(stats.rxerrors) + + local goal = rxpackets_start + npackets + local start = engine.now() + engine.main{no_report=true, done=function () + return counter.read(stats.rxpackets) >= goal + end} + + local duration = engine.now() - start + local rxpackets = counter.read(stats.rxpackets) - rxpackets_start + local rxdrop = counter.read(stats.rxdrop) - rxdrop_start + local rxerrors = counter.read(stats.rxerrors) - rxerrors_start + print(("Received %s packets in %.2f seconds"):format(lib.comma_value(rxpackets), duration)) + print(("Rx Rate is %.3f Mpps"):format(tonumber(rxpackets) / duration / 1e6)) + print(("Rx Drop Rate is %.3f Mpps"):format(tonumber(rxdrop) / duration / 1e6)) + print(("Rx Error Rate is %.3f Mpps"):format(tonumber(rxerrors) / duration / 1e6)) + io.stdout:flush() + + engine.main{duration=1} +end + +function sink_worker (pci, core, nqueues, idx) + if core then numa.bind_to_cpu(core, 'skip') end + engine.busywait = true + + local c = config.new() + config.app(c, "Sink", basic_apps.Sink) + local q = idx + for _=1, nqueues do + config.app(c, "IO"..q, connectx.IO, {pciaddress=pci, queue="q"..q}) + config.link(c, "IO"..q..".output -> Sink.input"..q) + q = q + 1 + end + engine.configure(c) + + while true do + engine.main{no_report=true, duration=1} + end +end + + +function source (pci, cores, nworkers, nqueues, macs, vlans, opt, npackets, pktsize, dmacs, dips, sips) + local cores = cpu_set(cores) + local macs = make_set(macs) + local dmacs = make_set(dmacs) + local vlans = make_set(vlans) + local dips = make_set(dips) + local sips = make_set(sips) + + local cfg = mlxconf(pci, nworkers*nqueues, macs, vlans, opt) + + local c = config.new() + config.app(c, "ConnectX", connectx.ConnectX, cfg) + engine.configure(c) + + for w=1, nworkers do + worker.start( + "source"..w, + ('require("apps.mellanox.benchmark").source_worker(%q, %s, %d, %d, ' + ..'%q, ' -- pktsize + ..'%s, %s, %s, %s, %s)') -- dst/src mac/vlan/ip + :format(pci, take(cores), nqueues, 1+(w-1)*nqueues, + pktsize, + give(dmacs, nqueues), give(macs, nqueues), + give(vlans, nqueues), + give(dips, nqueues), give(sips, nqueues)) + ) + end + + local stats = engine.app_table.ConnectX.stats + + local startline = npackets/10 + 
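The warmup-and-measure sequence that `sink` used above, and that `source` is about to repeat, boils down to one pattern: let a tenth of the target packet count pass as warmup, snapshot the counters, then time the remainder. As a standalone sketch (assuming a stats frame with an `rxpackets` counter, as in the drivers here):

```
local counter = require("core.counter")

-- Sketch: time how long it takes stats.rxpackets to advance by
-- npackets, after a warmup of npackets/10, and return the Mpps rate.
local function measure_mpps (stats, npackets)
   engine.main{no_report=true, done=function ()
      return counter.read(stats.rxpackets) >= npackets/10 -- warmup
   end}
   local base = counter.read(stats.rxpackets)
   local start = engine.now()
   engine.main{no_report=true, done=function ()
      return counter.read(stats.rxpackets) >= base + npackets
   end}
   local rx = counter.read(stats.rxpackets) - base
   return tonumber(rx) / (engine.now() - start) / 1e6
end
```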
engine.main{no_report=true, done=function () -- warmup + return counter.read(stats.txpackets) >= startline + end} + + local txpackets_start = counter.read(stats.txpackets) + local txdrop_start = counter.read(stats.txdrop) + local txerrors_start = counter.read(stats.txerrors) + + local goal = txpackets_start + npackets + local start = engine.now() + engine.main{no_report=true, done=function () + return counter.read(stats.txpackets) >= goal + end} + + local duration = engine.now() - start + local txpackets = counter.read(stats.txpackets) - txpackets_start + local txdrop = counter.read(stats.txdrop) - txdrop_start + local txerrors = counter.read(stats.txerrors) - txerrors_start + print(("Transmitted %s packets in %.2f seconds"):format(lib.comma_value(txpackets), duration)) + print(("Tx Rate is %.3f Mpps"):format(tonumber(txpackets) / duration / 1e6)) + print(("Tx Drop Rate is %.3f Mpps"):format(tonumber(txdrop) / duration / 1e6)) + print(("Tx Error Rate is %.3f Mpps"):format(tonumber(txerrors) / duration / 1e6)) + io.stdout:flush() + + engine.main{no_report=true, duration=1} +end + +function source_linger (...) + source(...) + engine.main() +end + +function source_worker (pci, core, nqueues, idx, pktsize, dmacs, smacs, vlans, dips, sips) + if core then numa.bind_to_cpu(core, 'skip') end + engine.busywait = true + + local c = config.new() + config.app(c, "Source", Source, { + packetsize = pktsize, + dmacs = dmacs, + smacs = smacs, + vlans = vlans, + dips = dips, + sips = sips + }) + local q = idx + for _=1, nqueues do + config.app(c, "IO"..q, connectx.IO, {pciaddress=pci, queue="q"..q, packetblaster=true}) + config.link(c, "Source.output"..q.." -> IO"..q..".input") + q = q + 1 + end + engine.configure(c) + + while true do + engine.main{no_report=true, duration=1} + end +end + +Source = { + config = { + packetsize = {required=true}, + dmacs = {required=true}, + smacs = {required=true}, + vlans = {required=true}, + dips = {required=true}, + sips = {required=true}, + buffersize = {default=1024} -- must be power of two + }, + dot1q_t = ffi.typeof[[struct { + uint16_t pcp_dei_vid; + uint16_t ethertype; + } __attribute__((packed))]] +} + +function Source:default_dmacs () return {"02:00:00:00:00:01"} end +function Source:default_smacs () return {"02:00:00:00:00:02"} end +function Source:default_vlans () return {0} end +function Source:default_dips () + local ips = {} + for i=1, 200 do ips[#ips+1] = "10.0.1."..i end + return ips +end +function Source:default_sips () + local ips = {} + for i=1, 200 do ips[#ips+1] = "10.0.2."..i end + return ips +end + +function Source:new (conf) + local self = setmetatable({}, {__index=Source}) + local size = tonumber(conf.packetsize) + if size then + self.sizes = make_set{size} + elseif conf.packetsize == 'IMIX' then + self.sizes = make_set{64, 64, 64, 64, 64, 64, 64, 576, 576, 576, 576, 1500} + else + error("NYI") + end + self.dmacs = make_set(#conf.dmacs > 0 and conf.dmacs or self:default_dmacs()) + self.smacs = make_set(#conf.smacs > 0 and conf.smacs or self:default_smacs()) + self.vlans = make_set(#conf.vlans > 0 and conf.vlans or self:default_vlans()) + self.dips = make_set(#conf.dips > 0 and conf.dips or self:default_dips()) + self.sips = make_set(#conf.sips > 0 and conf.sips or self:default_sips()) + self.buffersize = conf.buffersize + self.packets = ffi.new("struct packet *[?]", self.buffersize) + for i=0, self.buffersize-1 do + self.packets[i] = self:make_packet() + end + self.cursor = 0 + return self +end + +function Source:make_packet () + local ethernet = 
require("lib.protocol.ethernet") + local ipv4 = require("lib.protocol.ipv4") + local size = take(self.sizes) - 4 -- minus (4 byte CRC) + assert(size > (ethernet:sizeof() + ffi.sizeof(self.dot1q_t) + ipv4:sizeof())) + local eth = ethernet:new{ + dst = ethernet:pton(take(self.dmacs)), + src = ethernet:pton(take(self.smacs)), + type = 0x8100 -- dot1q + } + local dot1q = ffi.new(self.dot1q_t) + dot1q.pcp_dei_vid = lib.htons(take(self.vlans)) + dot1q.ethertype = lib.htons(0x0800) -- IPv4 + local ip = ipv4:new{ + dst = ipv4:pton(take(self.dips)), + src = ipv4:pton(take(self.sips)), + ttl = 64, + total_length = size - (eth:sizeof() + ffi.sizeof(dot1q)) + } + ip:checksum() + local p = packet.allocate() + packet.append(p, eth:header(), eth:sizeof()) + packet.append(p, dot1q, ffi.sizeof(dot1q)) + packet.append(p, ip:header(), ip:sizeof()) + packet.resize(p, size) + return p +end + +function Source:pull () + local cursor = self.cursor + local mask = self.buffersize-1 + local packets = self.packets + for _, output in pairs(self.output) do + while not link.full(output) do + link.transmit(output, packet.clone(packets[band(cursor,mask)])) + --link.transmit(output, packets[band(cursor,mask)]) + cursor = cursor + 1 + end + end + self.cursor = band(cursor, mask) +end + +function fwd (pci, cores, nworkers, nqueues, macs, vlans, opt, npackets) + local cores = cpu_set(cores) + local macs = make_set(macs) + local vlans = make_set(vlans) + + local cfg = mlxconf(pci, nworkers*nqueues, macs, vlans, opt) + + local c = config.new() + config.app(c, "ConnectX", connectx.ConnectX, cfg) + engine.configure(c) + + for w=1, nworkers do + worker.start( + "sink"..w, + ('require("apps.mellanox.benchmark").fwd_worker(%q, %s, %d, %d)') + :format(pci, take(cores), nqueues, 1+(w-1)*nqueues) + ) + end + + local stats = engine.app_table.ConnectX.stats + + local startline = npackets/10 + engine.main{no_report=true, done=function () -- warmup + return counter.read(stats.rxpackets) >= startline + end} + + local rxpackets_start = counter.read(stats.rxpackets) + local rxdrop_start = counter.read(stats.rxdrop) + local rxerrors_start = counter.read(stats.rxerrors) + local txpackets_start = counter.read(stats.txpackets) + local txdrop_start = counter.read(stats.txdrop) + local txerrors_start = counter.read(stats.txerrors) + + local goal = rxpackets_start + npackets + local start = engine.now() + engine.main{no_report=true, done=function () + return counter.read(stats.rxpackets) >= goal + end} + + local duration = engine.now() - start + local rxpackets = counter.read(stats.rxpackets) - rxpackets_start + local rxdrop = counter.read(stats.rxdrop) - rxdrop_start + local rxerrors = counter.read(stats.rxerrors) - rxerrors_start + print(("Received %s packets in %.2f seconds"):format(lib.comma_value(rxpackets), duration)) + print(("Rx Rate is %.3f Mpps"):format(tonumber(rxpackets) / duration / 1e6)) + print(("Rx Drop Rate is %.3f Mpps"):format(tonumber(rxdrop) / duration / 1e6)) + print(("Rx Error Rate is %.3f Mpps"):format(tonumber(rxerrors) / duration / 1e6)) + local txpackets = counter.read(stats.txpackets) - txpackets_start + local txdrop = counter.read(stats.txdrop) - txdrop_start + local txerrors = counter.read(stats.txerrors) - txerrors_start + print(("Forwarded %s packets in %.2f seconds"):format(lib.comma_value(txpackets), duration)) + print(("Fw Rate is %.3f Mpps"):format(tonumber(txpackets) / duration / 1e6)) + print(("Fw Drop Rate is %.3f Mpps"):format(tonumber(txdrop) / duration / 1e6)) + print(("Fw Error Rate is %.3f 
Mpps"):format(tonumber(txerrors) / duration / 1e6)) + io.stdout:flush() + + engine.main{duration=1} +end + +function fwd_worker (pci, core, nqueues, idx) + if core then numa.bind_to_cpu(core, 'skip') end + engine.busywait = true + + local c = config.new() + config.app(c, "Forward", Forward) + local q = idx + for _=1, nqueues do + config.app(c, "IO"..q, connectx.IO, {pciaddress=pci, queue="q"..q}) + config.link(c, "IO"..q..".output -> Forward.input"..q) + config.link(c, "Forward.output"..q.." -> IO"..q..".input") + q = q + 1 + end + engine.configure(c) + + while true do + engine.main{no_report=true, duration=1} + end +end + +Forward = {} + +local ethernet = require("lib.protocol.ethernet") + +function Forward:new (conf) + local self = setmetatable({}, {__index=Forward}) + self.eth = ethernet:new{} + return self +end + +function Forward:link () + self.input_links, self.output_links = {}, {} + for name, input in pairs(self.input) do + if type(name) == 'string' then + local q = name:match("input([0-9]+)") + self.input_links[#self.input_links+1] = input + self.output_links[#self.output_links+1] = self.output["output"..q] + end + end +end + +function Forward:push () + for i = 1, #self.input_links do + local input, output = self.input_links[i], self.output_links[i] + while not link.empty(input) do + local p = link.receive(input) + local eth = self.eth:new_from_mem(p.data, p.length) + eth:swap() + link.transmit(output, p) + end + end +end + + +function mlxconf (pci, nqueues, macs, vlans, opt, force_opt) + local queues = {} + for q=1, nqueues do + queues[q] = {id="q"..q, mac=take(macs), vlan=take(vlans)} + --print(pci, queues[q].id, queues[q].mac, queues[q].vlan) + end + + local cfg = {} + for k,v in pairs(opt or {}) do + cfg[k] = v + end + for k,v in pairs(force_opt or {}) do + cfg[k] = v + end + cfg.pciaddress = pci + cfg.queues = queues + + return cfg +end + +function make_set (items) + return {idx=1, items=items or {}} +end + +function take (set) + local item = set.items[set.idx] + set.idx = (set.idx % #set.items) + 1 + return item +end + +function give (set, n) + local a = "{" + for _=1, n do + local item = take(set) + if item then + local s = (type(item) == 'string') + and ("%q"):format(item) + or ("%s"):format(item) + a = a..(" %s,"):format(s) + else + break + end + end + return a.."}" +end + +function cpu_set (s) + local cores = {} + for core in pairs(numa.parse_cpuset(s or "")) do + cores[#cores+1] = core + end + return make_set(cores) +end \ No newline at end of file diff --git a/src/apps/mellanox/benchmark.snabb b/src/apps/mellanox/benchmark.snabb new file mode 100644 index 0000000000..2efbdd32bb --- /dev/null +++ b/src/apps/mellanox/benchmark.snabb @@ -0,0 +1,137 @@ +#!../../snabb snsh + +local worker = require("core.worker") +local lib = require("core.lib") + +local mode = 'source-sink' +local pci0, pci1 +local cores0, cores1 +local nworkers = 1 +local nqueues = 1 +local npackets = 100e6 +local pktsize = 64 +local nmacs +local nvlans +local mlxopts + +local long_opts = { + mode = "m", + ['pci-a'] = "a", + ['pci-b'] = "b", + ['cores-a'] = "A", + ['cores-b'] = "B", + nworkers = "w", + nqueues = "q", + npackets = "n", + pktsize = "s", + nmacs = "e", + nvlans = "v", + opt = "o", + help = "h", +} +local opts = "m:a:b:A:B:w:q:n:s:e:v:o:h" +local function print_usage () + for long, short in pairs(long_opts) do + print("--"..long, "-"..short) + end +end + +local opt = {} +function opt.m (arg) mode = arg end +function opt.a (arg) pci0 = arg end +function opt.b (arg) pci1 = arg end +function 
opt.A (arg) cores0 = arg end +function opt.B (arg) cores1 = arg end +function opt.w (arg) nworkers = tonumber(arg) end +function opt.q (arg) nqueues = tonumber(arg) end +function opt.n (arg) npackets = tonumber(arg) end +function opt.s (arg) pktsize = arg end +function opt.e (arg) nmacs = tonumber(arg) end +function opt.v (arg) nvlans = tonumber(arg) end +function opt.o (arg) mlxopts = arg end +function opt.h (arg) print_usage() main.exit(0) end + +main.parameters = lib.dogetopt(main.parameters, opt, opts, long_opts) + +assert(nworkers >= 1, "nworkers < 1") +assert(nqueues >= 1, "nqueues < 1") +assert(npackets >= 1, "npackets < 1") + +if type(pktsize) == 'string' then + pktsize = ("%q"):format(pktsize) +end + +local macs, dmacs, vlans +if nmacs and nmacs > 0 then + assert(nmacs <= 0xff, "nmacs too large") + macs = '{' + dmacs = '{' + for mac=1, nmacs do + macs = ("%s'02:01:00:00:00:%02X',"):format(macs, mac) + dmacs = ("%s'02:02:00:00:00:%02X',"):format(dmacs, mac) + end + macs = macs..'}' + dmacs = dmacs..'}' +end +if nvlans and nvlans > 0 then + vlans = '{' + for vlan=1, nvlans do + vlans = ("%s%d,"):format(vlans, vlan) + end + vlans = vlans..'}' +end + +if mode == 'source-sink' then + + worker.start("sink", ('require("apps.mellanox.benchmark").sink(%q, %q, %d, %d, %s, %s, %s, %d)') + :format(pci0, cores0, nworkers, nqueues, dmacs, vlans, mlxopts, npackets)) + + worker.start("source", ('require("apps.mellanox.benchmark").source_linger(%q, %q, %d, %d, %s, %s, %s, %d, %s, %s)') + :format(pci1, cores1, nworkers, nqueues, macs, vlans, mlxopts, npackets, pktsize, dmacs)) + + engine.main{done = function () + return not worker.status()["sink"].alive + end} + +elseif mode == 'source-fwd' then + + worker.start("forward", ('require("apps.mellanox.benchmark").fwd(%q, %q, %d, %d, %s, %s, %s, %d)') + :format(pci0, cores0, nworkers, nqueues, dmacs, vlans, mlxopts, npackets)) + + worker.start("source", ('require("apps.mellanox.benchmark").source_linger(%q, %q, %d, %d, %s, %s, %s, %d, %s, %s)') + :format(pci1, cores1, nworkers, nqueues, macs, vlans, mlxopts, npackets, pktsize, dmacs)) + + engine.main{done = function () + return not worker.status()["forward"].alive + end} + +elseif mode == 'source' then + + worker.start("source", ('require("apps.mellanox.benchmark").source(%q, %q, %d, %d, %s, %s, %s, %d, %s, %s)') + :format(pci0, cores0, nworkers, nqueues, macs, vlans, mlxopts, npackets, pktsize, dmacs)) + + engine.main{done = function () + return not worker.status()["source"].alive + end} + +elseif mode == 'sink' then + + worker.start("sink", ('require("apps.mellanox.benchmark").sink(%q, %q, %d, %d, %s, %s, %s, %d)') + :format(pci0, cores0, nworkers, nqueues, dmacs, vlans, mlxopts, npackets)) + + engine.main{done = function () + return not worker.status()["sink"].alive + end} + +elseif mode == 'fwd' then + + worker.start("forward", ('require("apps.mellanox.benchmark").fwd(%q, %q, %d, %d, %s, %s, %s, %d)') + :format(pci0, cores0, nworkers, nqueues, dmacs, vlans, mlxopts, npackets)) + + engine.main{done = function () + return not worker.status()["forward"].alive + end} + +else + error("NYI: mode "..mode) +end \ No newline at end of file diff --git a/src/apps/mellanox/connectx.lua b/src/apps/mellanox/connectx.lua index 9a9b2e1184..3961fce297 100644 --- a/src/apps/mellanox/connectx.lua +++ b/src/apps/mellanox/connectx.lua @@ -50,6 +50,7 @@ local band, bor, shl, shr, bswap, bnot = bit.band, bit.bor, bit.lshift, bit.rshift, bit.bswap, bit.bnot local cast, typeof = ffi.cast, ffi.typeof +local debug_info = 
false -- Print info messages
local debug_trace = false -- Print trace messages
local debug_hexdump = false -- Print hexdumps (in Linux mlx5 format)
@@ -149,14 +150,13 @@ local cxq_t = ffi.typeof([[
   // Transmit state
   struct packet *tx[64*1024]; // packets queued for transmit
-  uint16_t next_tx_wqeid; // work queue ID for next transmit descriptor
+  uint16_t next_tx_wqeid; // work queue ID for next transmit descriptor
   uint64_t *bf_next, *bf_alt; // "blue flame" to ring doorbell (alternating)

   // Receive state
   struct packet *rx[64*1024]; // packets queued for receive
-  uint16_t next_rx_wqeid; // work queue ID for next receive descriptor
-  uint16_t next_rx_cqeid; // completion queue ID of next completed packet
-  int rx_mine; // CQE ownership value that means software-owned
+  uint16_t next_rx_wqeid; // work queue ID for next receive descriptor
+  uint32_t rx_cqcc; // consumer counter of RX CQ
 } ]])
@@ -209,17 +209,39 @@ local mlx_types = {
   ["0x101d" ] = 6, -- ConnectX6
}

+ConnectX.config = {
+   pciaddress = { required = true },
+   sendq_size = { default = 1024 },
+   recvq_size = { default = 1024 },
+   mtu = { default = 9500 },
+   fc_rx_enable = { default = false },
+   fc_tx_enable = { default = false },
+   queues = { required = true },
+   macvlan = { default = false },
+}
+local queue_config = {
+   id = { required = true },
+   mac = { default = nil },
+   vlan = { default = nil },
+   enable_counters = { default = true },
+}
+
function ConnectX:new (conf)
   local self = setmetatable({}, self)
+   local queues = {}
+   for _, queue in ipairs(conf.queues) do
+      table.insert(queues, lib.parse(queue, queue_config))
+   end
+
   local pciaddress = pci.qualified(conf.pciaddress)
   local device_info = pci.device_info(pciaddress)
   self.mlx = assert(mlx_types[device_info.device], "Unsupported device "..device_info.device)

-   local sendq_size = conf.sendq_size or 1024
-   local recvq_size = conf.recvq_size or 1024
+   local sendq_size = conf.sendq_size
+   local recvq_size = conf.recvq_size

-   local mtu = conf.mtu or 9500
+   local mtu = conf.mtu

   -- Perform a hard reset of the device to bring it into a blank state.
   --
@@ -265,6 +287,8 @@ function ConnectX:new (conf)
   hca:set_port_mtu(mtu)
   hca:modify_nic_vport_context(mtu, true, true, true)

+   hca:set_port_flow_control(conf.fc_rx_enable, conf.fc_tx_enable)
+
   -- Create basic objects that we need
   --
   local uar = hca:alloc_uar()
@@ -280,8 +304,9 @@ function ConnectX:new (conf)
   local rqlist = {}
   local rqs = {}

-   -- List of queue counter IDs (ConnectX5 and up)
-   local counter_set_ids = {}
+   -- List of queue counter IDs and their corresponding queue IDs from
+   -- the configuration (ConnectX5 and up)
+   local q_counters = {}

   -- Enable MAC/VLAN switching?
   local usemac = false
@@ -328,9 +353,10 @@ function ConnectX:new (conf)
      -- Create the queue objects
      local tis = hca:create_tis(0, tdomain)
      local counter_set_id
-      if self.mlx > 4 then
+      if self.mlx > 4 and queue.enable_counters then
         counter_set_id = hca:alloc_q_counter()
-         table.insert(counter_set_ids, counter_set_id)
+         table.insert(q_counters, { counter_id = counter_set_id,
+                                    queue_id = queue.id })
      end
      -- XXX order check
      cxq.sqn = hca:create_sq(scqn, pd, sq_stride, sendq_size,
@@ -513,6 +539,11 @@ function ConnectX:new (conf)
      txdrop = {counter},
      txerrors = {counter},
   }
+   -- Create per-queue drop counters named by the queue identifiers in
+   -- the configuration.
+   for _, queue in ipairs(conf.queues) do
+      frame["rxdrop_"..queue.id] = {counter}
+   end
   self.stats = shm.create_frame("pci/"..pciaddress, frame)

   -- Create separate HCAs to retrieve port statistics. Those
@@ -558,17 +589,18 @@ function ConnectX:new (conf)
   }

   -- Empty for ConnectX4
-   for _, id in ipairs(counter_set_ids) do
+   for _, q_counter in ipairs(q_counters) do
+      local per_q_rxdrop = self.stats["rxdrop_"..q_counter.queue_id]
      table.insert(self.stats_reqs,
         {
           start_fn = HCA.query_q_counter_start,
           finish_fn = HCA.query_q_counter_finish,
-           args = { set_id = id },
+           args = q_counter.counter_id,
           process_fn = function(r, stats)
              -- Incremental update relies on query_q_counter to
              -- clear the counter after read.
-              counter.set(stats.rxdrop,
-                          counter.read(stats.rxdrop) + r.out_of_buffer)
+              counter.add(stats.rxdrop, r.out_of_buffer)
+              counter.add(per_q_rxdrop, r.out_of_buffer)
           end
        })
   end
@@ -617,6 +649,7 @@ function ConnectX:new (conf)
   function self:pull ()
      if self.sync_timer() then
         self:sync_stats()
+         eq:poll()
      end
   end
@@ -779,6 +812,9 @@ end
-- Provide the NIC with freshly allocated memory.
function HCA:alloc_pages (num_pages)
   assert(num_pages > 0)
+   if debug_info then
+      print(("Allocating %d pages to HW"):format(num_pages))
+   end
   self:command("MANAGE_PAGES", 0x14 + num_pages*8, 0x0C)
      :input("opcode", 0x00, 31, 16, 0x108)
      :input("opmod", 0x04, 15, 0, 1) -- allocate mode
@@ -791,6 +827,20 @@ function HCA:alloc_pages (num_pages)
   self:execute()
end

+function HCA:free_pages (num_pages)
+   assert(num_pages > 0)
+   if debug_info then
+      print(("Reclaiming %d pages from HW"):format(num_pages))
+   end
+   self:command("MANAGE_PAGES", 0x0C, 0x10 + num_pages*8)
+      :input("opcode", 0x00, 31, 16, 0x108)
+      :input("opmod", 0x04, 15, 0, 2) -- return pages
+      :input("input_num_entries", 0x0C, 31, 0, num_pages)
+      :execute()
+   local num_entries = self:output(0x08, 31, 0)
+   -- TODO: deallocate DMA pages
+end
+
-- Query the NIC capabilities (maximum or current setting).
function HCA:query_hca_general_cap (max_or_current)
   local opmod = assert(({max=0, current=1})[max_or_current])
@@ -884,67 +934,145 @@ end
-- Event queues
---------------------------------------------------------------

+-- Event Queue Entry (EQE)
+local eqe_t = ffi.typeof([[
+   struct {
+      uint8_t reserved1;
+      uint8_t event_type;
+      uint8_t reserved2;
+      uint8_t event_sub_type;
+      uint32_t reserved3[7];
+      uint32_t event_data[7];
+      uint16_t reserved4;
+      uint8_t signature;
+      uint8_t owner;
+   }
+]])
+
-- Create an event queue that can be accessed via the given UAR page number.
function HCA:create_eq (uar)
   local numpages = 1
   local log_eq_size = 7 -- 128 entries
-   local ptr, phy = memory.dma_alloc(4096, 4096) -- memory for entries
+   local byte_size = 2^log_eq_size * ffi.sizeof(eqe_t)
+   local ptr, phy = memory.dma_alloc(byte_size, 4096) -- memory for entries
+   local events = bits({
+      CQError = 0x04,
+      PortStateChange = 0x09,
+      PageRequest = 0x0B,
+   })
   self:command("CREATE_EQ", 0x10C + numpages*8, 0x0C)
      :input("opcode", 0x00, 31, 16, 0x301)
+      :input("oi", 0x10 + 0x00, 17, 17, 1) -- overrun ignore
      :input("log_eq_size", 0x10 + 0x0C, 28, 24, log_eq_size)
      :input("uar_page", 0x10 + 0x0C, 23, 0, uar)
      :input("log_page_size", 0x10 + 0x18, 28, 24, 2) -- XXX best value? 0 or max?
-      :input("event bitmask", 0x10 + 0x5C, 31, 0, bits({PageRequest=0xB})) -- XXX more events?
+      :input("event bitmask", 0x5C, 31, 0, events)
      :input("pas[0] high", 0x110, 31, 0, ptrbits(phy, 63, 32))
      :input("pas[0] low", 0x114, 31, 0, ptrbits(phy, 31, 0))
      :execute()
   local eqn = self:output(0x08, 7, 0)
-   return eq:new(eqn, ptr, 2^log_eq_size)
+   return eq:new(eqn, ptr, log_eq_size, self)
end

--- Event Queue Entry (EQE)
-local eqe_t = ffi.typeof([[
-   struct {
-      uint16_t event_type;
-      uint16_t event_sub_type;
-      uint32_t event_data;
-      uint16_t pad;
-      uint8_t signature;
-      uint8_t owner;
-   } ]] )
-
eq = {}
eq.__index = eq

-- Create event queue object.
-function eq:new (eqn, pointer, nentries)
+function eq:new (eqn, pointer, log_size, hca)
+   local nentries = 2^log_size
   local ring = ffi.cast(ffi.typeof("$*", eqe_t), pointer)
-   for i = 0, nentries-1 do
+   for i = 0, nentries - 1 do
+      -- Owner = HW
      ring[i].owner = 1
   end
+   local mask = nentries - 1
   return setmetatable({eqn = eqn,
                        ring = ring,
                        index = 0,
-                        n = nentries},
+                        log_size = log_size,
+                        mask = nentries - 1,
+                        hca = hca,
+                        },
      self)
end

+function eq:sw_value ()
+   return band(shr(self.index, self.log_size), 1)
+end
+
+function eq:entry ()
+   local slot = band(self.index, self.mask)
+   return self.ring[slot]
+end
+
-- Poll the queue for events.
-function eq:poll()
-   print("Polling EQ")
-   local eqe = self.ring[self.index]
-   while eqe.owner == 0 and eqe.event_type ~= 0xFF do
+function eq:poll ()
+   local eqe = self:entry()
+   while eqe.owner == self:sw_value() do
+      self:handle_event(eqe)
      self.index = self.index + 1
-      eqe = self.ring[self.index % self.n]
-      self:event(eqe)
+      eqe = self:entry()
   end
-   print("done polling EQ")
end

-- Handle an event.
-function eq:event ()
-   print(("Got event %s.%s"):format(eqe.event_type, eqe.event_sub_type))
-   error("Event handling not yet implemented")
+local event_page_req = ffi.cdef([[
+   struct event_page_req {
+      uint16_t reserved1;
+      uint16_t function_id;
+      uint32_t num_pages;
+      uint32_t reserved2[5];
+   }
+]])
+local event_port_change = ffi.cdef([[
+   struct event_port_change {
+      uint32_t reserved1[2];
+      uint8_t port_num;
+      uint8_t reserved2[3];
+      uint32_t reserved3[4];
+   }
+]])
+local port_status = {
+   [1] = "down",
+   [4] = "up"
+}
+local event_cq_error = ffi.cdef([[
+   struct event_cq_error {
+      uint32_t cqn;
+      uint32_t reserved1;
+      uint8_t reserved2[3];
+      uint8_t syndrome;
+      uint32_t reserved3[4];
+   }
+]])
+local cq_errors = {
+   [1] = "overrun",
+   [2] = "access violation"
+}
+function eq:handle_event (eqe)
+   if eqe.event_type == 0x04 then
+      local cq_error = cast(typeof("struct event_cq_error *"), eqe.event_data)
+      local cqn = bswap(cq_error.cqn)
+      error(("Error on completion queue #%d: %s"):format(cqn, cq_errors[cq_error.syndrome]))
+   elseif eqe.event_type == 0x09 then
+      if debug_info then
+         local port_change = cast(typeof("struct event_port_change *"), eqe.event_data)
+         local port = shr(port_change.port_num, 4)
+         print(("Port %d changed state to %s"):format(port, port_status[eqe.event_sub_type]))
+      end
+   elseif eqe.event_type == 0xB then
+      local page_req = cast(typeof("struct event_page_req *"), eqe.event_data)
+      local num_pages = bswap(page_req.num_pages)
+      if num_pages < 0 then
+         num_pages = -num_pages
+         self.hca:free_pages(num_pages)
+      else
+         self.hca:alloc_pages(num_pages)
+      end
+   else
+      error(("Received unexpected event type 0x%02x, subtype 0x%02x"):format(eqe.event_type,
                                                                              eqe.event_sub_type))
+   end
end

---------------------------------------------------------------
@@ -1242,6 +1370,12 @@ IO.__index = IO
-- lib.hardware.pci.device_info
driver = IO

+IO.config = {
+   pciaddress = {required=true},
+   queue =
{required=true}, + packetblaster = {default=false} +} + function IO:new (conf) local self = setmetatable({}, self) @@ -1328,6 +1462,17 @@ function IO:new (conf) close() end + -- Configure self as packetblaster? + if conf.packetblaster then + self.push = nil + self.pull = function (self) + if activate() then + sq:blast(self.input.input or self.input.rx) + deactivate() + end + end + end + return self end @@ -1340,9 +1485,11 @@ function RQ:new (cxq) local rq = {} local mask = cxq.rqsize - 1 - -- Return the transmit queue slot for the given WQE ID. - local function slot (wqeid) - return band(wqeid, mask) + -- Return the queue slot for the given consumer counter for either + -- the CQ or the WQ. This assumes that both queues have the same + -- size. + local function slot (cc) + return band(cc, mask) end -- Refill with buffers @@ -1366,40 +1513,44 @@ function RQ:new (cxq) end end + local log2_rqsize = log2size(cxq.rqsize) + local function sw_owned () + -- The value of the ownership flag that indicates owned by SW for + -- the current consumer counter is flipped every time the counter + -- wraps around the receive queue. + return band(shr(cxq.rx_cqcc, log2_rqsize), 1) + end + local function have_input () - local c = cxq.rcq[cxq.next_rx_cqeid] + local c = cxq.rcq[slot(cxq.rx_cqcc)] local owner = bit.band(1, c.u8[0x3F]) - return owner == cxq.rx_mine + return owner == sw_owned() end function rq:receive (l) local limit = engine.pull_npackets - while have_input() and limit > 0 and not link.full(l) do + while limit > 0 and have_input() do -- Find the next completion entry. - local c = cxq.rcq[cxq.next_rx_cqeid] + local c = cxq.rcq[slot(cxq.rx_cqcc)] limit = limit - 1 -- Advance to next completion. -- Note: assumes sqsize == cqsize - cxq.next_rx_cqeid = slot(cxq.next_rx_cqeid + 1) - -- Toggle the ownership value if the CQ wraps around. - if cxq.next_rx_cqeid == 0 then - cxq.rx_mine = (cxq.rx_mine + 1) % 2 - end + cxq.rx_cqcc = cxq.rx_cqcc + 1 -- Decode the completion entry. local opcode = shr(c.u8[0x3F], 4) local len = bswap(c.u32[0x2C/4]) local wqeid = shr(bswap(c.u32[0x3C/4]), 16) local idx = slot(wqeid) - if opcode == 0 or opcode == 2 then + if band(opcode, 0xfd) == 0 then -- opcode == 0 or opcode == 2 -- Successful receive local p = cxq.rx[idx] - assert(p ~= nil) + -- assert(p ~= nil) p.length = len link.transmit(l, p) cxq.rx[idx] = nil elseif opcode == 13 or opcode == 14 then -- Error on receive - assert(cxq.rx[idx] ~= nil) + -- assert(cxq.rx[idx] ~= nil) packet.free(cxq.rx[idx]) cxq.rx[idx] = nil local syndromes = { @@ -1422,10 +1573,6 @@ function RQ:new (cxq) end end - function rq:ring_doorbell () - doorbell[0].receive = bswap(next_buffer) - end - return rq end @@ -1482,7 +1629,7 @@ function SQ:new (cxq, mmio) end -- Ring the doorbell if we enqueued new packets. 
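+      -- (next_tx_wqeid is a free-running 16-bit counter: it differs from
+      -- start_wqeid exactly when this call queued new descriptors)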
if cxq.next_tx_wqeid ~= start_wqeid then - local current_packet = slot(cxq.next_tx_wqeid + cxq.sqsize-1) + local current_packet = slot(cxq.next_tx_wqeid + mask) cxq.doorbell.send = bswap(cxq.next_tx_wqeid) cxq.bf_next[0] = cxq.swq[current_packet].u64[0] -- Switch next/alternate blue flame register for next time @@ -1496,13 +1643,78 @@ function SQ:new (cxq, mmio) local opcode = cxq.scq[0].u8[0x38] if opcode == 0x0A then local wqeid = shr(bswap(cxq.scq[0].u32[0x3C/4]), 16) - while next_reclaim ~= wqeid % cxq.sqsize do - assert(cxq.tx[next_reclaim] ~= nil) + while next_reclaim ~= slot(wqeid) do + -- assert(cxq.tx[next_reclaim] ~= nil) packet.free(cxq.tx[next_reclaim]) cxq.tx[next_reclaim] = nil - next_reclaim = tonumber(slot(next_reclaim + 1)) + next_reclaim = slot(next_reclaim + 1) + end + end + end + + -- Packetblaster: blast packets from link out of send queue. + function sq:blast (l) + local kickoff = sq:blast_load(l) + + -- Get current send queue tail (hardware controlled) + local opcode = cxq.scq[0].u8[0x38] + if opcode == 0x0A then + local wqeid = shr(bswap(cxq.scq[0].u32[0x3C/4]), 16) + + -- Keep send queue topped up + local next_slot = slot(cxq.next_tx_wqeid) + while next_slot ~= slot(wqeid) do + local wqe = cxq.swq[next_slot] + -- Update control segment + wqe.u32[0] = bswap(shl(cxq.next_tx_wqeid, 8) + 0x0A) + -- Advance counters + cxq.next_tx_wqeid = cxq.next_tx_wqeid + 1 + next_slot = slot(cxq.next_tx_wqeid) end end + + if opcode == 0x0A or kickoff then + -- Ring the doorbell + local current_packet = slot(cxq.next_tx_wqeid + mask) + cxq.doorbell.send = bswap(cxq.next_tx_wqeid) + cxq.bf_next[0] = cxq.swq[current_packet].u64[0] + -- Switch next/alternate blue flame register for next time + cxq.bf_next, cxq.bf_alt = cxq.bf_alt, cxq.bf_next + end + end + + -- Packetblaster: load packets from link into send queue. + local loaded = 0 + function sq:blast_load (l) + while loaded < cxq.sqsize and not link.empty(l) do + local p = link.receive(l) + local next_slot = slot(cxq.next_tx_wqeid) + local wqe = cxq.swq[next_slot] + + -- Construct a 64-byte transmit descriptor. + -- This is in three parts: Control, Ethernet, Data. + -- The Ethernet part includes some inline data. + + -- Control segment + wqe.u32[0] = bswap(shl(cxq.next_tx_wqeid, 8) + 0x0A) + wqe.u32[1] = bswap(shl(cxq.sqn, 8) + 4) + wqe.u32[2] = bswap(shl(2, 2)) -- completion always + -- Ethernet segment + local ninline = 16 + wqe.u32[7] = bswap(shl(ninline, 16)) + ffi.copy(wqe.u8 + 0x1E, p.data, ninline) + -- Send Data Segment (inline data) + wqe.u32[12] = bswap(p.length - ninline) + wqe.u32[13] = bswap(cxq.rlkey) + local phy = memory.virtual_to_physical(p.data + ninline) + wqe.u32[14] = bswap(tonumber(shr(phy, 32))) + wqe.u32[15] = bswap(tonumber(band(phy, 0xFFFFFFFF))) + -- Advance counters + cxq.next_tx_wqeid = cxq.next_tx_wqeid + 1 + loaded = loaded + 1 + -- Kickoff? 
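+      -- (becomes true once the entire ring has been loaded; sq:blast() treats
+      -- that as the cue to ring the doorbell before any completion is seen)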
+ return loaded == cxq.sqsize + end end return sq @@ -1670,6 +1882,7 @@ end PMTU = 0x5003 PTYS = 0x5004 -- Port Type and Speed PAOS = 0x5006 -- Port Administrative & Operational Status +PFCC = 0x5007 -- Port Flow Control Configuration PPCNT = 0x5008 -- Ports Performance Counters PPLR = 0x5018 -- Port Physical Loopback Register @@ -1839,6 +2052,37 @@ function HCA:get_port_stats_finish () return port_stats end +function HCA:set_port_flow_control (rx_enable, tx_enable) + self:command("ACCESS_REGISTER", 0x1C, 0x1C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 0) -- write + :input("register_id", 0x08, 15, 0, PFCC) + :input("local_port", 0x10, 23, 16, 1) + :input("pptx", 0x10 + 0x08, 31, 31, tx_enable and 1 or 0) + :input("pprx", 0x10 + 0x0C, 31, 31, rx_enable and 1 or 0) + :execute() +end + +local fc_status = {} +function HCA:get_port_flow_control () + self:command("ACCESS_REGISTER", 0x10, 0x1C) + :input("opcode", 0x00, 31, 16, 0x805) + :input("opmod", 0x04, 15, 0, 1) -- read + :input("register_id", 0x08, 15, 0, PFCC) + :input("local_port", 0x10, 23, 16, 1) + :execute() + fc_status.pptx = self:output(0x10 + 0x08, 31, 31) + fc_status.aptx = self:output(0x10 +0x08, 30, 30) + fc_status.pfctx = self:output(0x10 + 0x08, 23, 16) + fc_status.fctx_disabled = self:output(0x10 +0x08, 8, 8) + fc_status.pprx = self:output(0x10 + 0x0c, 31, 31) + fc_status.aprx = self:output(0x10 + 0x0c, 30, 30) + fc_status.pfcrx = self:output(0x10 +0x0c, 23, 16) + fc_status.stall_minor_watermark = self:output(0x10 +0x10, 31, 16) + fc_status.stall_crit_watermark = self:output(0x10 +0x10, 15, 0) + return fc_status +end + function HCA:alloc_q_counter() self:command("ALLOC_Q_COUNTER", 0x18, 0x10C) :input("opcode", 0x00, 31, 16, 0x771) @@ -1849,13 +2093,13 @@ end local q_stats = { out_of_buffer = 0ULL } -function HCA:query_q_counter_start (args) +function HCA:query_q_counter_start (id) self:command("QUERY_Q_COUNTER", 0x20, 0x10C) :input("opcode", 0x00, 31, 16, 0x773) -- Clear the counter after reading. This allows us to -- update the rxdrop stat incrementally. :input("clear", 0x18, 31, 31, 1) - :input("counter_set_id",0x1c, 7, 0, args.set_id) + :input("counter_set_id",0x1c, 7, 0, id) :execute_async() end @@ -2360,8 +2604,8 @@ function selftest () io1.output = { output = link.new('output1') } -- Exercise the IO apps before the NIC is initialized. 
io0:pull() io0:push() io1:pull() io1:push() - local nic0 = ConnectX:new{pciaddress = pcidev0, queues = {{id='a'}}} - local nic1 = ConnectX:new{pciaddress = pcidev1, queues = {{id='b'}}} + local nic0 = ConnectX:new(lib.parse({pciaddress = pcidev0, queues = {{id='a'}}}, ConnectX.config)) + local nic1 = ConnectX:new(lib.parse({pciaddress = pcidev1, queues = {{id='b'}}}, ConnectX.config)) print("selftest: waiting for both links up") while (nic0.hca:query_vport_state().oper_state ~= 1) or diff --git a/src/apps/mellanox/connectx_test.lua b/src/apps/mellanox/connectx_test.lua index cba693ddcd..7c4d79bafb 100644 --- a/src/apps/mellanox/connectx_test.lua +++ b/src/apps/mellanox/connectx_test.lua @@ -40,8 +40,10 @@ function switch (pci0, pci1, npackets, ncores, minlen, maxlen, minburst, maxburs end end -- Instantiate app network - local nic0 = connectx.ConnectX:new({pciaddress=pci0, queues=queues}) - local nic1 = connectx.ConnectX:new({pciaddress=pci1, queues=queues}) + local nic0 = connectx.ConnectX:new(lib.parse({pciaddress=pci0, queues=queues}, + connectx.ConnectX.config)) + local nic1 = connectx.ConnectX:new(lib.parse({pciaddress=pci1, queues=queues}, + connectx.ConnectX.config)) local io0 = {} -- io apps on nic0 local io1 = {} -- io apps on nic1 print(("creating %d queues per device..."):format(#queues)) @@ -316,6 +318,12 @@ function basic_match (pci0, pci1) engine.configure(c) + print("waiting for linkup...") + lib.waitfor(function () + return engine.app_table.nic0.hca:linkup() + and engine.app_table.nic1.hca:linkup() + end) + engine.main({duration = 1, report = false}) engine.report_links() engine.report_apps() diff --git a/src/apps/packet_filter/pcap_filter.lua b/src/apps/packet_filter/pcap_filter.lua index 612192587a..4e1160f1c6 100644 --- a/src/apps/packet_filter/pcap_filter.lua +++ b/src/apps/packet_filter/pcap_filter.lua @@ -135,8 +135,7 @@ function selftest_run (stateful, expected, tolerance, native) print(("Run for 1 second (stateful = %s)..."):format(stateful)) - local deadline = lib.timeout(1.0) - repeat app.breathe() until deadline() + app.main{duration=1} app.report({showlinks=true}) local sent = link.stats(app.app_table.pcap_filter.input.input).rxpackets diff --git a/src/apps/pcap/tap.lua b/src/apps/pcap/tap.lua index f18171136b..db720cf4bc 100644 --- a/src/apps/pcap/tap.lua +++ b/src/apps/pcap/tap.lua @@ -67,7 +67,7 @@ function selftest () config.link(c, "source.output -> tap.input") config.link(c, "tap.output -> sink.input") app.configure(c) - while not app.app_table.source.done do app.breathe() end + app.main{done=function () return app.app_table.source.done end} local n = 0 for packet, record in pcap.records(tmp) do n = n + 1 end diff --git a/src/apps/rate_limiter/rate_limiter.lua b/src/apps/rate_limiter/rate_limiter.lua index 3b68b65c62..4b02918cc2 100644 --- a/src/apps/rate_limiter/rate_limiter.lua +++ b/src/apps/rate_limiter/rate_limiter.lua @@ -148,11 +148,7 @@ function selftest () local snapshot = rl:get_stat_snapshot() -- push some packets through it - while seconds_to_run > 0 do - app.breathe() - timer.run() - C.usleep(10) -- avoid busy loop - end + app.main{duration=seconds_to_run} -- print final report app.report() @@ -194,10 +190,7 @@ function selftest () rl:reset(rate_busy_loop, bucket_size) local snapshot = rl:get_stat_snapshot() - for i = 1, 100000 do - app.breathe() - timer.run() - end + app.main{duration=0.1} local elapsed_time = (tonumber(C.get_time_ns()) - snapshot.time) / 1e9 print("elapsed time ", elapsed_time, "seconds") diff --git a/src/apps/rss/README.md 
b/src/apps/rss/README.md index e92b0373b8..ed488e1770 100644
--- a/src/apps/rss/README.md
+++ b/src/apps/rss/README.md
@@ -51,12 +51,27 @@ the following set of extensions.

The output links can be grouped into equivalence classes with respect
to matching conditions in terms of arbitrary pflang expressions as
-provided by the `pf` module. Matching packets are only distributed to
-the output links that belong to the equivalence class. By default, a
-single equivalence class exists which matches all packets. It is
-special in the sense that the matching condition cannot be expressed
-in pflang. This default class is the only one that can receive non-IP
-packets.
+provided by the `pf` module. Because the current implementation of the
+`pf` module does not implement the `vlan` primitive, an auxiliary
+construct is needed to match on the VLAN tag if required. Apart from a
+regular BPF expression, the `rss` module also accepts a string of the
+form
+
+```
+VLAN <vid> ... [ BPF <expression> ]
+```
+
+where `<vid>` are numbers representing VLAN IDs. This expression
+matches a packet that carries any of the given VLAN tags. If the
+expression also contains the keyword `BPF` followed by a regular BPF
+expression, the packet must also match that expression to be mapped to
+this equivalence class.
+
+Matching packets are only distributed to the output links that belong
+to the equivalence class. By default, a single equivalence class
+exists which matches all packets. It is special in the sense that the
+matching condition cannot be expressed in pflang. This default class
+is the only one that can receive non-IP packets.

Classes are specified in an explicit order when an instance of the
`rss` app is created. The default class is created implicitly as the
diff --git a/src/apps/rss/rss.lua b/src/apps/rss/rss.lua
index 24efd2d1cf..225bf32a74 100644
--- a/src/apps/rss/rss.lua
+++ b/src/apps/rss/rss.lua
@@ -15,6 +15,8 @@ local receive, transmit = link.receive, link.transmit
local nreadable = link.nreadable
local free, clone = packet.free, packet.clone
local mdadd, mdget, mdcopy = metadata.add, metadata.get, metadata.copy
+local ether_header_ptr_t = metadata.ether_header_ptr_t
+
local transport_proto_p = {
   -- TCP
@@ -34,7 +36,8 @@ rss = {
   shm = {
      rxpackets = { counter, 0},
      rxdrops_filter = { counter, 0}
-   }
+   },
+   push_link = {}
}
local class_config = {
   name = { required = true },
@@ -42,22 +45,71 @@ local class_config = {
   continue = { default = false }
}

+local function mk_addr_copy_fn (size)
+   local str = "return function (dst, src)\n"
+   for i = 0, size-1 do
+      str = str..string.format(" dst[%d] = src[%d]\n", i, i)
+   end
+   str = str.."end\n"
+   return loadstring(str)()
+end
+
local hash_info = {
   -- IPv4
   [0x0800] = {
      addr_offset = 12,
-      addr_size = 8
+      -- 64-bit words
+      addr_size = 1
   },
   -- IPv6
   [0x86dd] = {
      addr_offset = 8,
-      addr_size = 32
+      -- 64-bit words
+      addr_size = 4
   },
}

+local etht_demux = {}
+
+function etht_demux:alloc_l2 ()
+   local l2 = ffi.new("struct link *[256]", self.default_queue)
+   table.insert(self.l2_anchors, l2)
+   return l2
+end
+
+local function split (type)
+   local hi = bit.rshift(type, 8)
+   local lo = bit.band(type, 0x00FF)
+   return hi, lo
+end
+
+function etht_demux:add (type, link)
+   local hi, lo = split(type)
+   local l2 = self.l1[hi]
+   if l2 == self.default then
+      l2 = self:alloc_l2()
+      self.l1[hi] = l2
+   end
+   l2[lo] = link
+end
+
+function etht_demux:new (default_queue)
+   local o = setmetatable({}, { __index = etht_demux })
+   o.default_queue = default_queue
+   o.l2_anchors = {}
+   o.default = o:alloc_l2()
+   o.l1 = ffi.new("struct link **[256]", o.default)
+   return o
+end
+
+function etht_demux:lookup (type)
+   local hi, lo = split(type)
+   return self.l1[hi][lo]
+end
+
function rss:new (config)
   local o = { classes = {},
-               links_configured = {},
+               classes_active = {},
               queue = link.new("queue"),
               rxpackets = 0,
               rxdrops_filter = 0,
@@ -68,7 +120,7 @@ function rss:new (config)
   for _, info in pairs(hash_info) do
      info.key_t = ffi.typeof([[
         struct {
-            uint8_t addrs[$];
+            uint64_t addrs[$];
            uint32_t ports;
            uint8_t proto;
         } __attribute__((packed))
@@ -77,16 +129,19 @@ function rss:new (config)
      info.hash_fn = siphash.make_hash({ size = ffi.sizeof(info.key),
                                         key = siphash.random_sip_hash_key() })
+      info.copy_addr_fn = mk_addr_copy_fn(info.addr_size)
   end

   local function add_class (name, match_fn, continue)
      assert(name:match("%w+"), "Illegal class name: "..name)
+      local seq = #o.classes
      table.insert(o.classes, {
         name = name,
         match_fn = match_fn,
         continue = continue,
         input = link.new(name),
-         output = { n = 0 }
+         output = { n = 0 },
+         seq = seq
      })
   end
@@ -96,59 +151,136 @@ function rss:new (config)
      assert(not classes[config.name], "Duplicate filter class: "..config.name)
      classes[config.name] = true
-      add_class(config.name, pf.compile_filter(config.filter),
-                config.continue)
+      local match_fn
+      local vlans = config.filter:match("^VLAN (.*)$")
+      if vlans then
+         local expr = ""
+         local pf_fn
+         for vlan in vlans:split("%s") do
+            if (vlan:match("^(%d+)$")) then
+               expr = expr.."md.vlan == "..vlan.." or "
+            elseif vlan == "BPF" then
+               local bpf = config.filter:match("BPF (.*)")
+               pf_fn = pf.compile_filter(bpf)
+               break
+            else
+               error(string.format("illegal VLAN ID in filter expression of "..
                                      "class %s: %s", config.name, vlan))
+            end
+         end
+         expr = expr.."nil"
+         match_fn = loadstring("return function(md) return "..expr.." end")()
+         if pf_fn then
+            local match_fn_aux = match_fn
+            match_fn = function(md)
+               return match_fn_aux(md) and pf_fn(md.filter_start, md.filter_length)
+            end
+         end
+      else
+         local pf_fn = pf.compile_filter(config.filter)
+         match_fn = function(md)
+            return pf_fn(md.filter_start, md.filter_length)
+         end
+      end
+      add_class(config.name, match_fn, config.continue)
   end
   if config.default_class then
      -- Catch-all default filter
      add_class("default", function () return true end)
   end

+   o.demux_queues = {}
+
+   local function add_queue(name)
+      local queue = link.new(name)
+      o.demux_queues[name] = queue
+      return queue
+   end
+
+   local function add_demux(default, types)
+      local default_queue = add_queue(default)
+      local demux = etht_demux:new(default_queue)
+      for _, type in ipairs(types) do
+         local queue = add_queue(type.name)
+         demux:add(type.type, queue)
+      end
+      return demux
+   end
+
+   o.demux1 = add_demux("default_untagged", {
+      { name = "dot1q",
+        type = 0x8100,
+      },
+      { name = "ipv4",
+        type = 0x0800,
+      },
+      { name = "ipv6",
+        type = 0x86dd,
+      }
+   })
+   o.demux2 = add_demux("default_tagged", {
+      { name = "ipv4_tagged",
+        type = 0x0800,
+      },
+      { name = "ipv6_tagged",
+        type = 0x86dd,
+      }
+   })
+
+   o.nqueues = #o.demux_queues
+
   return setmetatable(o, { __index = self })
end

-function rss:link ()
-   for name, l in pairs(self.output) do
-      if type(name) == "string" then
-         if not self.links_configured[name] then
-            self.links_configured[name] = true
-            local match = false
-            for _, class in ipairs(self.classes) do
-               local instance = name:match("^"..class.name.."_(.*)")
-               if instance then
-                  match = true
-                  local weight = instance:match("^%w+_(%d+)$") or 1
-                  for _ = 1, weight do
-                     table.insert(class.output, l)
-                  end
-                  -- Avoid calls to lj_tab_len() in distribute()
-                  class.output.n = #class.output
-               end
-            end
-            if not match then
-               print("Ignoring link (does not match any filters): "..name)
-            end
-         end
+local function insert_unique(t, new_elt)
+   for _, elt in ipairs(t) do
+      if elt == new_elt then
+         return
      end
   end
+   table.insert(t, new_elt)
+end

-   self.classes_active = {}
-   for _, class in ipairs(self.classes) do
-      if #class.output > 0 then
-         table.insert(self.classes_active, class)
+function rss:link (direction, name)
+   if direction == 'input' then
+      local vlan = name:match("^vlan(%d+)$")
+      if vlan then
+         vlan = tonumber(vlan)
+         assert(vlan > 0 and vlan < 4095, "Illegal VLAN id: "..vlan)
      end
-   end
+      self.push_link[name] = function (self, input)
+         self:push_with_vlan(input, vlan)
+      end
+   else
+      local match = false
+      for _, class in ipairs(self.classes) do
+         local instance = name:match("^"..class.name.."_(.*)")
+         if instance then
+            match = true
+            local weight = instance:match("^%w+_(%d+)$") or 1
+            for _ = 1, weight do
+               table.insert(class.output, self.output[name])
+            end
+            -- Avoid calls to lj_tab_len() in distribute()
+            class.output.n = #class.output

-   self.input_tagged = {}
-   for name, link in pairs(self.input) do
-      if type(name) == "string" then
-         local vlan = name:match("^vlan(%d+)$")
-         if vlan then
-            vlan = tonumber(vlan)
-            assert(vlan > 0 and vlan < 4095, "Illegal VLAN id: "..vlan)
+            insert_unique(self.classes_active, class)
         end
-         table.insert(self.input_tagged, { link = link, vlan = vlan })
      end
+      -- Preserve order
+      table.sort(self.classes_active, function (a, b) return a.seq < b.seq end)
+
+      if not match then
+         print("Ignoring link (does not match any filters): "..name)
+      end
+   end
+end
+
+function rss:unlink (direction, name)
+   if direction == 'input' then
+      self.push_link[name] = nil
+   else
+      -- XXX - undo 'output' case in link()?
   end
end

@@ -156,7 +288,7 @@ local function hash (md)
   local info = hash_info[md.ethertype]
   local hash = 0
   if info then
-      ffi.copy(info.key.addrs, md.l3 + info.addr_offset, info.addr_size)
+      info.copy_addr_fn(info.key.addrs, ffi.cast("uint64_t*", md.l3 + info.addr_offset))
      if transport_proto_p[md.proto] then
         info.key.ports = ffi.cast("uint32_t *", md.l4)[0]
      else
@@ -176,17 +308,80 @@ local function distribute (p, links, hash)
   transmit(links[index], p)
end

-function rss:push ()
+local function md_wrapper(self, demux_queue, queue, vlan)
+   local p = receive(demux_queue)
+   hash(mdadd(p, self.rm_ext_headers, vlan))
+   transmit(queue, p)
+end
+
+function rss:push_with_vlan(link, vlan)
+   local npackets = nreadable(link)
+   self.rxpackets = self.rxpackets + npackets
   local queue = self.queue
-   for _, input in ipairs(self.input_tagged) do
-      local link, vlan = input.link, input.vlan
-      local npackets = nreadable(link)
-      self.rxpackets = self.rxpackets + npackets
+   -- Use do..end blocks here to limit the scopes of locals to avoid
+   -- "too many spill slots" trace aborts
+   do
+      -- Performance tuning: mdadd() needs to be called for every
+      -- packet. With a mix of tagged/untagged ipv4/ipv6 traffic, that
+      -- function has a number of unbiased branches, leading to
+      -- inefficient side traces. We use a branch-free classifier on
+      -- the Ethertype to separate packets of each type into separate
+      -- queues and process them in separate loops. In each loop,
+      -- mdadd() is inlined and compiled for the specifics of that
+      -- type of packet, which results in 100% biased branches and
+      -- thus no side traces. Note that for this to work, the loops
+      -- have to be explicit in the code below to allow the compiler
+      -- to produce distinct versions of the inlined function.
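+      -- For example, Ethertype 0x8100 splits into hi=0x81/lo=0x00, so
+      -- demux1:lookup() resolves it via l1[0x81][0x00] directly to the
+      -- dot1q queue; unmatched types land on the default queue because
+      -- every l2 array is initialized with the default queue pointer.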
for _ = 1, npackets do local p = receive(link) - hash(mdadd(p, self.rm_ext_headers, vlan)) - transmit(queue, p) + local hdr = ffi.cast(ether_header_ptr_t, p.data) + transmit(self.demux1:lookup(lib.ntohs(hdr.ether.type)), p) + end + + local dot1q = self.demux_queues.dot1q + for _ = 1, nreadable(dot1q) do + local p = receive(dot1q) + local hdr = ffi.cast(ether_header_ptr_t, p.data) + transmit(self.demux2:lookup(lib.ntohs(hdr.dot1q.type)), p) + end + + local demux_queues = self.demux_queues + do + local dqueue = demux_queues.default_untagged + for _ = 1, nreadable(dqueue) do + md_wrapper(self, dqueue, queue, vlan) + end + end + do + local dqueue = demux_queues.default_tagged + for _ = 1, nreadable(dqueue) do + md_wrapper(self, dqueue, queue, vlan) + end + end + do + local dqueue = demux_queues.ipv4 + for _ = 1, nreadable(dqueue) do + md_wrapper(self, dqueue, queue, vlan) + end + end + do + local dqueue = demux_queues.ipv6 + for _ = 1, nreadable(dqueue) do + md_wrapper(self, dqueue, queue, vlan) + end + end + do + local dqueue = demux_queues.ipv4_tagged + for _ = 1, nreadable(dqueue) do + md_wrapper(self, dqueue, queue, vlan) + end + end + do + local dqueue = demux_queues.ipv6_tagged + for _ = 1, nreadable(dqueue) do + md_wrapper(self, dqueue, queue, vlan) + end end end @@ -198,7 +393,7 @@ function rss:push () for _ = 1, nreadable(queue) do local p = receive(queue) local md = mdget(p) - if class.match_fn(md.filter_start, md.filter_length) then + if class.match_fn(md) then md.ref = md.ref + 1 transmit(class.input, p) if class.continue then @@ -231,7 +426,9 @@ function rss:push () end end end +end +function rss:tick() if self.sync_timer() then counter.set(self.shm.rxpackets, self.rxpackets) counter.set(self.shm.rxdrops_filter, self.rxdrops_filter) diff --git a/src/apps/socket/raw.lua b/src/apps/socket/raw.lua index d668248072..de9bab7aa0 100644 --- a/src/apps/socket/raw.lua +++ b/src/apps/socket/raw.lua @@ -52,26 +52,26 @@ function RawSocket:pull () local l = self.output.tx if l == nil then return end local limit = engine.pull_npackets - while limit > 0 and self:can_receive() do + while limit > 0 and self:try_read() do limit = limit - 1 link.transmit(l, self:receive()) end end -function RawSocket:can_receive () - local t, err = S.select({readfds = {self.sock}}, 0) - while not t and (err.AGAIN or err.INTR) do - t, err = S.select({readfds = {self.sock}}, 0) +function RawSocket:try_read () + local rxp = self.rx_p + local bytes = S.read(self.sock, rxp.data, packet.max_payload) + if bytes then + rxp.length = bytes + return true + else + return false end - assert(t, err) - return t.count == 1 end function RawSocket:receive () local p = self.rx_p - local sz = assert(S.read(self.sock, p.data, packet.max_payload)) - p.length = sz - counter.add(self.shm.rxbytes, sz) + counter.add(self.shm.rxbytes, p.length) counter.add(self.shm.rxpackets) if ethernet:is_mcast(p.data) then counter.add(self.shm.rxmcast) @@ -85,34 +85,33 @@ end function RawSocket:push () local l = self.input.rx if l == nil then return end - while not link.empty(l) and self:can_transmit() do - local p = link.receive(l) - self:transmit(p) - counter.add(self.shm.txbytes, p.length) - counter.add(self.shm.txpackets) - if ethernet:is_mcast(p.data) then - counter.add(self.shm.txmcast) + while not link.empty(l) do + local p = link.front(l) + if self:try_transmit(p) then + link.receive(l) + counter.add(self.shm.txbytes, p.length) + counter.add(self.shm.txpackets) + if ethernet:is_mcast(p.data) then + counter.add(self.shm.txmcast) + end + if 
ethernet:is_bcast(p.data) then + counter.add(self.shm.txbcast) + end + packet.free(p) + else + break end - if ethernet:is_bcast(p.data) then - counter.add(self.shm.txbcast) - end - packet.free(p) end end -function RawSocket:can_transmit () - local t, err = S.select({writefds = {self.sock}}, 0) - while not t and (err.AGAIN or err.INTR) do - t, err = S.select({writefds = {self.sock}}, 0) - end - assert(t, err) - return t.count == 1 -end - -function RawSocket:transmit (p) +function RawSocket:try_transmit (p) local sz, err = S.write(self.sock, p.data, p.length) + if (not sz and err.AGAIN) then + return false + end assert(sz, err) assert(sz == p.length) + return true end function RawSocket:stop() diff --git a/src/apps/test/match.lua b/src/apps/test/match.lua index 57d48f7453..c4697ddf51 100644 --- a/src/apps/test/match.lua +++ b/src/apps/test/match.lua @@ -84,14 +84,16 @@ function selftest() engine.main({duration=0.0001}) assert(#engine.app_table.sink:errors() > 0) - engine.configure(config.new()) + local c = config.new() config.app(c, "sink", Match, {fuzzy=true}) + config.app(c, "src", basic_apps.Source, 8) config.app(c, "comparator", basic_apps.Source, 8) config.app(c, "garbage", basic_apps.Source, 12) config.app(c, "join", basic_apps.Join) config.link(c, "src.output -> join.src") config.link(c, "garbage.output -> join.garbage") config.link(c, "join.output -> sink.rx") + config.link(c, "comparator.output -> sink.comparator") engine.configure(c) engine.main({duration=0.0001}) assert(#engine.app_table.sink:errors() == 0) diff --git a/src/apps/vlan/vlan.lua b/src/apps/vlan/vlan.lua index 422ffb912f..faea4d3dae 100644 --- a/src/apps/vlan/vlan.lua +++ b/src/apps/vlan/vlan.lua @@ -28,7 +28,8 @@ Untagger = { VlanMux = { config = { encapsulation = default_encap, - } + }, + push_link = {} } local tpids = { dot1q = 0x8100, dot1ad = 0x88A8 } @@ -127,62 +128,74 @@ function Untagger:push () end function VlanMux:new (conf) - local o = setmetatable({}, {__index=VlanMux}) + local o = setmetatable({ vlan_links = {} }, {__index=VlanMux}) return new_aux(o, conf) end -function VlanMux:link () - local from_vlans, to_vlans = {}, {} - for name, l in pairs(self.input) do - if string.match(name, "vlan%d+") then - local vid = check_tag(tonumber(string.sub(name, 5))) - to_vlans[vid] = self.output[name] - table.insert(from_vlans, { link = l, vid = vid }) - elseif name == "native" then - to_vlans[0] = self.output.native - elseif type(name) == "string" and name ~= "trunk" then - error("invalid link name "..name) - end +function VlanMux:link (dir, name) + local vid = self:link_vid(dir, name) + if dir == 'output' and vid then + self.vlan_links[vid] = self[dir][name] + elseif dir == 'input' and vid then + local tag = build_tag(vid, self.tpid) + self.push_link[name] = self:make_push_from_vlan(tag) + end +end + +function VlanMux:unlink (dir, name) + local vid = self:link_vid(dir, name) + if dir == 'output' and vid then + self.vlan_links[vid] = nil + end +end + +function VlanMux:link_vid (dir, name) + local vid = name:match("vlan(%d+)") + if vid then + return check_tag(tonumber(vid)) + elseif name == 'native' then + return ({output=0, input=nil})[dir] + elseif name == 'trunk' then + return nil + else + error("invalid link name "..name) + end +end + +function VlanMux:make_push_from_vlan (tag) + return function (self, lin) + self:push_from_vlan(lin, tag) end - self.from_vlans = from_vlans - self.to_vlans = to_vlans end -function VlanMux:push () - local from, to = self.from_vlans, self.to_vlans +function VlanMux:push_from_vlan 
(lin, tag) + local otrunk = assert(self.output.trunk) + for _ = 1, link.nreadable(lin) do + self:transmit(otrunk, push_tag(receive(lin), tag)) + end +end + +function VlanMux.push_link:native (lin) + local otrunk = assert(self.output.trunk) + for _ = 1, link.nreadable(lin) do + self:transmit(otrunk, receive(lin)) + end +end + +function VlanMux.push_link:trunk (itrunk) + local links = self.vlan_links local tpid = self.tpid - local l_in = self.input.trunk - assert(l_in) - while not empty(l_in) do - local p = receive(l_in) + for _ = 1, link.nreadable(itrunk) do + local p = receive(itrunk) local ethertype = cast("uint16_t*", p.data + o_ethernet_ethertype)[0] if ethertype == htons(tpid) then -- dig out TCI field local tci = extract_tci(p) local vid = tci_to_vid(tci) - self:transmit(to[vid], pop_tag(p)) + self:transmit(links[vid], pop_tag(p)) else -- untagged, send to native output - self:transmit(to[0], p) - end - end - - local l_out = self.output.trunk - local i = 1 - while from[i] do - local from = from[i] - local l_in = from.link - while not empty(l_in) do - local p = receive(l_in) - self:transmit(l_out, push_tag(p, build_tag(from.vid, tpid))) - end - i = i + 1 - end - - local l_in = self.input.native - if l_in then - while not empty(l_in) do - self:transmit(l_out, receive(l_in)) + self:transmit(links[0], p) end end end @@ -241,15 +254,17 @@ function selftest() app.configure(c) app.main({duration = 1}) - print("vlan sent: " - ..link.stats(app.app_table.vlan_source.output.output).txpackets) - print("native sent: " - ..link.stats(app.app_table.native_source.output.output).txpackets) - print("trunk received: " - ..link.stats(app.app_table.trunk_sink.input.input).rxpackets) - print("trunk sent: " - ..link.stats(app.app_table.trunk_source.output.output).txpackets) - print("native received: " - ..link.stats(app.app_table.native_sink.input.input).rxpackets) + local vsent = link.stats(app.app_table.vlan_source.output.output).txpackets + local nsent = link.stats(app.app_table.native_source.output.output).txpackets + local trecv = link.stats(app.app_table.trunk_sink.input.input).rxpackets + local tsent = link.stats(app.app_table.trunk_source.output.output).txpackets + local nrecv = link.stats(app.app_table.native_sink.input.input).rxpackets + print("vlan sent: "..vsent) + print("native sent: "..nsent) + print("trunk received: "..trecv) + assert(trecv == vsent + nsent) + print("trunk sent: "..tsent) + print("native received: "..nrecv) + assert(nrecv == tsent) test_tag_untag() end diff --git a/src/core/app.lua b/src/core/app.lua index f4059d6d30..11825ce0b2 100644 --- a/src/core/app.lua +++ b/src/core/app.lua @@ -21,7 +21,6 @@ pull_npackets = math.floor(link.max / 10) -- Set to true to enable logging log = false -local use_restart = false test_skipped_code = 43 @@ -74,6 +73,22 @@ maxsleep = 100 -- loop (100% CPU) instead of sleeping according to the Hz setting. busywait = false +-- tick_Hz: Frequency at which to execute tick() methods ( per second) +tick_Hz = 1000 + +local tick, tick_current_freq +function enable_tick (freq) + freq = freq or tick_Hz + if freq == tick_current_freq then + return + end + if freq > 0 then + tick = lib.throttle(1/freq) + else + tick = function () return false end + end +end + -- Profiling with vmprofile -------------------------------- vmprofile_enabled = true @@ -121,56 +136,6 @@ function now () return (running and monotonic_now) or C.get_monotonic_time() end --- Run app:methodname() in protected mode (pcall). 
If it throws an --- error app will be marked as dead and restarted eventually. -function with_restart (app, method) - local status, result - setvmprofile(app.zone) - if use_restart then - -- Run fn in protected mode using pcall. - status, result = pcall(method, app) - - -- If pcall caught an error mark app as "dead" (record time and cause - -- of death). - if not status then - app.dead = { error = result, time = now() } - end - else - status, result = true, method(app) - end - setvmprofile("engine") - return status, result -end - --- Restart dead apps. -function restart_dead_apps () - if not use_restart then return end - local restart_delay = 2 -- seconds - local actions = {} - - for name, app in pairs(app_table) do - if app.dead and (now() - app.dead.time) >= restart_delay then - io.stderr:write(("Restarting %s (died at %f: %s)\n") - :format(name, app.dead.time, app.dead.error)) - local info = configuration.apps[name] - table.insert(actions, {'stop_app', {name}}) - table.insert(actions, {'start_app', {name, info.class, info.arg}}) - for linkspec in pairs(configuration.links) do - local fa, fl, ta, tl = config.parse_link(linkspec) - if fa == name then - table.insert(actions, {'link_output', {fa, fl, linkspec}}) - end - if ta == name then - table.insert(actions, {'link_input', {ta, tl, linkspec}}) - end - end - end - end - - -- Restart dead apps if necessary. - if #actions > 0 then apply_config_actions(actions) end -end - -- Configure the running app network to match new_configuration. -- -- Successive calls to configure() will migrate from the old to the @@ -349,14 +314,16 @@ function apply_config_actions (actions) local link = app.output[linkname] app.output[linkname] = nil remove_link_from_array(app.output, link) - if app.link then app:link() end + if app.unlink then app:unlink('output', linkname) + elseif app.link then app:link('output', linkname) end end function ops.unlink_input (appname, linkname) local app = app_table[appname] local link = app.input[linkname] app.input[linkname] = nil remove_link_from_array(app.input, link) - if app.link then app:link() end + if app.unlink then app:unlink('input', linkname) + elseif app.link then app:link('input', linkname) end end function ops.free_link (linkspec) link.free(link_table[linkspec], linkspec) @@ -370,16 +337,22 @@ function apply_config_actions (actions) function ops.link_output (appname, linkname, linkspec) local app = app_table[appname] local link = assert(link_table[linkspec]) + assert(not app.output[linkname], + appname..": duplicate output link "..linkname) app.output[linkname] = link table.insert(app.output, link) - if app.link then app:link() end + if app.link then app:link('output', linkname) end end function ops.link_input (appname, linkname, linkspec) local app = app_table[appname] local link = assert(link_table[linkspec]) + assert(not app.input[linkname], + appname..": duplicate input link "..linkname) app.input[linkname] = link table.insert(app.input, link) - if app.link then app:link() end + if app.link then + app:link('input', linkname) + end end function ops.stop_app (name) local app = app_table[name] @@ -404,6 +377,16 @@ function apply_config_actions (actions) app.shm.dtime = {counter, C.get_unix_time()} app.shm = shm.create_frame("apps/"..name, app.shm) end + if class.push_link then + if type(class.push_link) ~= 'table' then + error(("bad push_link value for app '%s' (must be a table)") + :format(name)) + end + app.push_link = {} + for name, method in pairs(class.push_link) do + app.push_link[name] = method + end + end 
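+         -- (per-instance copy: an app's link()/unlink() methods may add or
+         -- remove push_link entries without mutating the shared class table)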
configuration.apps[name] = { class = class, arg = arg } end function ops.reconfig_app (name, class, arg) @@ -445,13 +428,17 @@ function tsort (nodes, entries, successors) return ret end -breathe_pull_order = {} -breathe_push_order = {} +local breathe_pull_order = {} +local breathe_push_order = {} +local breathe_ticks = {} -- Sort the links in the app graph, and arrange to run push() on the -- apps on the receiving ends of those links. This will run app:push() -- once for each link, which for apps with multiple links may cause the -- app's push function to run multiple times in a breath. +-- +-- Also collect tick methods that need to be run on tick breaths in +-- deterministic order. function compute_breathe_order () breathe_pull_order, breathe_push_order = {}, {} local pull_links, inputs, successors = {}, {}, {} @@ -468,11 +455,17 @@ function compute_breathe_order () end end for linkname,link in pairs(app.input) do - linknames[link] = appname..'.'..linkname - inputs[link] = app + -- NB: each link is indexed by number and by name. + if type(linkname) == 'string' then + linknames[link] = appname..'.'..linkname + local push_link = app.push_link and app.push_link[linkname] + local push = push_link or app.push + inputs[link] = { app = app, push = push, link = link } + end end end - for link,app in pairs(inputs) do + for link,spec in pairs(inputs) do + local app = spec.app successors[link] = {} if not app.pull then for _,succ in pairs(app.output) do @@ -500,12 +493,22 @@ function compute_breathe_order () table.sort(successors[link], cmp_links) end local link_order = tsort(nodes, entry_nodes, successors) - local i = 1 for _,link in ipairs(link_order) do - if breathe_push_order[#breathe_push_order] ~= inputs[link] then - table.insert(breathe_push_order, inputs[link]) + local spec = inputs[link] + local prev = breathe_push_order[#breathe_push_order] + if spec.push then + if not prev or prev.app ~= spec.app or prev.push ~= spec.push then + table.insert(breathe_push_order, spec) + end end end + breathe_ticks = {} + for _,app in pairs(app_table) do + if app.tick then + table.insert(breathe_ticks, app) + end + end + table.sort(breathe_ticks, cmp_apps) end -- Call this to "run snabb switch". 
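To illustrate the scheduling collected above: tick() methods are gathered once per configure() and run in deterministic (sorted) order whenever the engine's tick timer fires. A minimal sketch of an app using the mechanism (the app name and its counter are hypothetical, not part of this patch):

```lua
-- Hypothetical app: tick() suits low-frequency housekeeping that should
-- not run once per breath in push().
local Housekeeper = {}

function Housekeeper:new ()
   return setmetatable({ticks = 0}, {__index = Housekeeper})
end

-- Called at engine.tick_Hz (default 1000 per second); setting
-- engine.tick_Hz = 0 disables tick() methods entirely.
function Housekeeper:tick ()
   self.ticks = self.ticks + 1
end
```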
@@ -532,6 +535,9 @@ function main (options) breathe = latency:wrap_thunk(breathe, now) end + -- Enable tick + enable_tick() + monotonic_now = C.get_monotonic_time() repeat breathe() @@ -575,16 +581,14 @@ end function breathe () running = true monotonic_now = C.get_monotonic_time() - -- Restart: restart dead apps - restart_dead_apps() -- Inhale: pull work into the app network local i = 1 ::PULL_LOOP:: do - if i > #breathe_pull_order then goto PULL_EXIT end - local app = breathe_pull_order[i] - if app.pull and not app.dead then - with_restart(app, app.pull) + if i > #breathe_pull_order then goto PULL_EXIT else + local app = breathe_pull_order[i] + setvmprofile(app.zone) + app:pull() end i = i+1 goto PULL_LOOP @@ -594,20 +598,27 @@ function breathe () i = 1 ::PUSH_LOOP:: do - if i > #breathe_push_order then goto PUSH_EXIT end - local app = breathe_push_order[i] - if app.push and not app.dead then - with_restart(app, app.push) + if i > #breathe_push_order then goto PUSH_EXIT else + local spec = breathe_push_order[i] + local app, push, link = spec.app, spec.push, spec.link + setvmprofile(app.zone) + push(app, link) end i = i+1 goto PUSH_LOOP end ::PUSH_EXIT:: + -- Tick: call tick() methods at tick_Hz frequency + if tick() then + for _, app in ipairs(breathe_ticks) do + app:tick() + end + end + setvmprofile("engine") counter.add(breaths) - -- Commit counters and rebalance freelists at a reasonable frequency + -- Commit counters at a reasonable frequency if counter.read(breaths) % 100 == 0 then counter.commit() - packet.rebalance_freelists() end running = false end @@ -686,35 +697,28 @@ end function report_apps () print ("apps report:") for name, app in pairs(app_table) do - if app.dead then - print(name, ("[dead: %s]"):format(app.dead.error)) - elseif app.report then + if app.report then + setvmprofile(app.zone) print(name) - if use_restart then - with_restart(app, app.report) - else - -- Restarts are disabled, still we want to not die on - -- errors during app reports, thus this workaround: - local status, err = pcall(app.report, app) - if not status then - print("Warning: "..name.." 
threw an error during report: "..err) - end - end + app:report() end end + setvmprofile("engine") end function selftest () print("selftest: app") - local App = { push = true } + local App = {} function App:new () return setmetatable({}, {__index = App}) end + function App:pull () end + function App:push () end local c1 = config.new() config.app(c1, "app1", App) config.app(c1, "app2", App) config.link(c1, "app1.x -> app2.x") print("empty -> c1") configure(c1) - assert(#breathe_pull_order == 0) + assert(#breathe_pull_order == 2) assert(#breathe_push_order == 1) assert(app_table.app1 and app_table.app2) local orig_app1 = app_table.app1 @@ -732,7 +736,7 @@ function selftest () config.link(c2, "app2.x -> app1.x") print("c1 -> c2") configure(c2) - assert(#breathe_pull_order == 0) + assert(#breathe_pull_order == 2) assert(#breathe_push_order == 2) assert(app_table.app1 ~= orig_app1) -- should be restarted assert(app_table.app2 == orig_app2) -- should be the same @@ -742,7 +746,7 @@ function selftest () configure(c1) -- c2 -> c1 assert(app_table.app1 ~= orig_app1) -- should be restarted assert(app_table.app2 == orig_app2) -- should be the same - assert(#breathe_pull_order == 0) + assert(#breathe_pull_order == 2) assert(#breathe_push_order == 1) print("c1 -> empty") configure(config.new()) @@ -759,45 +763,106 @@ function selftest () assert(not pcall(config.app, c3, "app_invalid", AppC)) assert(not pcall(config.app, c3, "app_invalid", AppC, {b="bar"})) assert(not pcall(config.app, c3, "app_invalid", AppC, {a="bar", c="foo"})) --- Test app restarts on failure. - use_restart = true - print("c_fail") - local App1 = {zone="test"} - function App1:new () return setmetatable({}, {__index = App1}) end - function App1:pull () error("Pull error.") end - function App1:push () return true end - function App1:report () return true end - local App2 = {zone="test"} - function App2:new () return setmetatable({}, {__index = App2}) end - function App2:pull () return true end - function App2:push () error("Push error.") end - function App2:report () return true end - local App3 = {zone="test"} - function App3:new () return setmetatable({}, {__index = App3}) end - function App3:pull () return true end - function App3:push () return true end - function App3:report () error("Report error.") end - local c_fail = config.new() - config.app(c_fail, "app1", App1) - config.app(c_fail, "app2", App2) - config.app(c_fail, "app3", App3) - config.link(c_fail, "app1.x -> app2.x") - configure(c_fail) - local orig_app1 = app_table.app1 - local orig_app2 = app_table.app2 - local orig_app3 = app_table.app3 - main({duration = 4, report = {showapps = true}}) - assert(app_table.app1 ~= orig_app1) -- should be restarted - assert(app_table.app2 ~= orig_app2) -- should be restarted - assert(app_table.app3 == orig_app3) -- should be the same - main({duration = 4, report = {showapps = true}}) - assert(app_table.app3 ~= orig_app3) -- should be restarted -- Check engine stop + local c4 = config.new() + config.app(c4, "app1", App) + engine.configure(c4) assert(not lib.equal(app_table, {})) engine.stop() assert(lib.equal(app_table, {})) + -- Test tick() + local TickApp = {} + function TickApp:new () return setmetatable({ticks=0}, {__index = TickApp}) end + function TickApp:tick () self.ticks = self.ticks + 1 end + local c5 = config.new() + config.app(c5, "app_tick", TickApp) + engine.configure(c5) + local t = 0.1 + engine.main{duration=t} + local expected_ticks = t * tick_Hz + local ratio = app_table.app_tick.ticks / expected_ticks + assert(ratio 
>= 0.9 and ratio <= 1.1) + print("ticks: actual/expected = "..ratio) + + -- Test link() 3.0 + local LinkApp = {push_link={}} + function LinkApp:new () + local self = {linked={input={}, output={}}, called={}, pushed=false} + return setmetatable(self, {__index = LinkApp}) + end + function LinkApp:link (dir, name) + print('link', dir, name) + self.linked[dir][name] = assert(self[dir][name]) + if dir == 'input' then + self.push_link[name] = function (self, input) + print('push_link', name, input) + self.called[name] = true + end + end + end + function LinkApp:unlink (dir, name) + print('unlink', dir, name) + assert(not self[dir][name]) + self.linked[dir][name] = nil + end + function LinkApp:push () + self.pushed = true + end + local c6 = config.new() + config.app(c6, "app_pull", App) + config.app(c6, "link_app", LinkApp) + config.link(c6, "app_pull.output -> link_app.input") + engine.configure(c6) + assert(#breathe_pull_order == 1) + assert(#breathe_push_order == 1) + engine.main{done=function () return true end} + assert(app_table.link_app.linked.input.input) + assert(app_table.link_app.called.input) + assert(not app_table.link_app.pushed) + local c7 = config.new() + config.app(c7, "app_pull", App) + config.app(c7, "link_app", LinkApp) + engine.configure(c7) + assert(not app_table.link_app.linked.input.input) + -- Backwards compatible? + local LegacyApp = {push_link={}} + function LegacyApp:new () + local self = {linked={input={}, output={}}, called={}, pushed=false} + return setmetatable(self, {__index = LegacyApp}) + end + function LegacyApp:link (dir, name) + print('link', dir, name) + self.linked[dir][name] = self[dir][name] + end + function LegacyApp.push_link:newstyle (input) + print('push_link', 'newstyle', input) + self.called.newstyle = true + end + function LegacyApp:push () + self.pushed = true + end + local c8 = config.new() + config.app(c8, "app_pull", App) + config.app(c8, "link_app", LegacyApp) + config.link(c8, "app_pull.output -> link_app.input") + config.link(c8, "app_pull.output2 -> link_app.newstyle") + engine.configure(c8) + assert(#breathe_pull_order == 1) + assert(#breathe_push_order == 2) + engine.main{done=function () return true end} + assert(app_table.link_app.linked.input.input) + assert(app_table.link_app.linked.input.newstyle) + assert(app_table.link_app.called.newstyle) + assert(app_table.link_app.pushed) + local c9 = config.new() + config.app(c9, "app_pull", App) + config.app(c9, "link_app", LegacyApp) + engine.configure(c9) + assert(not app_table.link_app.linked.input.input) + assert(not app_table.link_app.linked.input.newstyle) + -- Check one can't unclaim a name if no name is claimed. assert(not pcall(unclaim_name)) diff --git a/src/core/clib.h b/src/core/clib.h index 33906c3b49..de2496fbb3 100644 --- a/src/core/clib.h +++ b/src/core/clib.h @@ -27,6 +27,9 @@ void *memmove(void *dest, const void *src, int n); // strncpy(3) - copy a string char *strncpy(char *dest, const char *src, size_t n); +// strncasecmp(3) - compare two strings ignoring case +int strncasecmp(const char *s1, const char *s2, size_t n); + // read(2) - read from a file descriptor int read(int fd, void *buf, size_t count); diff --git a/src/core/group_freelist.lua b/src/core/group_freelist.lua new file mode 100644 index 0000000000..142a2fbc18 --- /dev/null +++ b/src/core/group_freelist.lua @@ -0,0 +1,163 @@ +-- Use of this source code is governed by the Apache 2.0 license; see COPYING. 
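+--
+-- Usage sketch (illustrative only, using the API defined below):
+--
+--   local fl = freelist_create("group/example.freelist")
+--   -- Producer: reserve a chunk (nil if full), fill it, publish it.
+--   local chunk, seq = start_add(fl)
+--   if chunk then
+--      chunk.nfree = 1
+--      chunk.list[0] = p
+--      finish(chunk, seq)
+--   end
+--   -- Consumer: claim a chunk (nil if empty), drain it, release the slot.
+--   local chunk, seq = start_remove(fl)
+--   if chunk then
+--      local p = chunk.list[chunk.nfree-1]
+--      finish(chunk, seq)
+--   end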
+
+module(...,package.seeall)
+
+local sync = require("core.sync")
+local shm = require("core.shm")
+local lib = require("core.lib")
+local ffi = require("ffi")
+
+local waitfor, compiler_barrier = lib.waitfor, lib.compiler_barrier
+local band = bit.band
+
+-- Group freelist: lock-free multi-producer multi-consumer ring buffer
+-- (mpmc queue)
+--
+-- https://www.1024cores.net/home/lock-free-algorithms/queues/bounded-mpmc-queue
+--
+-- NB: assumes 32-bit wide loads/stores are atomic (as is the case on x86_64)!
+
+-- Group freelist holds up to SIZE chunks of chunksize packets each
+chunksize = 2048
+
+-- (SIZE=1024)*(chunksize=2048) == roughly two million packets
+local SIZE = 1024 -- must be a power of two
+local MAX = SIZE - 1
+
+local CACHELINE = 64 -- XXX - make dynamic
+local INT = ffi.sizeof("uint32_t")
+
+ffi.cdef([[
+struct group_freelist_chunk {
+ uint32_t sequence[1], nfree;
+ struct packet *list[]]..chunksize..[[];
+} __attribute__((packed))]])
+
+ffi.cdef([[
+struct group_freelist {
+ uint32_t enqueue_pos[1];
+ uint8_t pad_enqueue_pos[]]..CACHELINE-1*INT..[[];
+
+ uint32_t dequeue_pos[1];
+ uint8_t pad_dequeue_pos[]]..CACHELINE-1*INT..[[];
+
+ struct group_freelist_chunk chunk[]]..SIZE..[[];
+
+ uint32_t state[1];
+} __attribute__((packed, aligned(]]..CACHELINE..[[)))]])
+
+-- Group freelist states
+local CREATE, INIT, READY = 0, 1, 2
+
+function freelist_create (name)
+ local fl = shm.create(name, "struct group_freelist")
+ if sync.cas(fl.state, CREATE, INIT) then
+ for i = 0, MAX do
+ fl.chunk[i].sequence[0] = i
+ end
+ fl.state[0] = READY
+ else
+ waitfor(function () return fl.state[0] == READY end)
+ end
+ return fl
+end
+
+function freelist_open (name, readonly)
+ local fl = shm.open(name, "struct group_freelist", readonly)
+ waitfor(function () return fl.state[0] == READY end)
+ return fl
+end
+
+local function mask (i)
+ return band(i, MAX)
+end
+
+function start_add (fl)
+ local pos = fl.enqueue_pos[0]
+ while true do
+ local chunk = fl.chunk[mask(pos)]
+ local seq = chunk.sequence[0]
+ local dif = seq - pos
+ if dif == 0 then
+ if sync.cas(fl.enqueue_pos, pos, pos+1) then
+ return chunk, pos+1
+ end
+ elseif dif < 0 then
+ return
+ else
+ compiler_barrier() -- ensure fresh load of enqueue_pos
+ pos = fl.enqueue_pos[0]
+ end
+ end
+end
+
+function start_remove (fl)
+ local pos = fl.dequeue_pos[0]
+ while true do
+ local chunk = fl.chunk[mask(pos)]
+ local seq = chunk.sequence[0]
+ local dif = seq - (pos+1)
+ if dif == 0 then
+ if sync.cas(fl.dequeue_pos, pos, pos+1) then
+ return chunk, pos+MAX+1
+ end
+ elseif dif < 0 then
+ return
+ else
+ compiler_barrier() -- ensure fresh load of dequeue_pos
+ pos = fl.dequeue_pos[0]
+ end
+ end
+end
+
+function finish (chunk, seq)
+ chunk.sequence[0] = seq
+end
+
+function selftest ()
+ local fl = freelist_create("test_freelist")
+ assert(not start_remove(fl)) -- empty
+
+ local w1, sw1 = start_add(fl)
+ local w2, sw2 = start_add(fl)
+ assert(not start_remove(fl)) -- empty
+ finish(w2, sw2)
+ assert(not start_remove(fl)) -- empty
+ finish(w1, sw1)
+ local r1, sr1 = start_remove(fl)
+ assert(r1 == w1)
+ local r2, sr2 = start_remove(fl)
+ assert(r2 == w2)
+ assert(not start_remove(fl)) -- empty
+ finish(r1, sr1)
+ finish(r2, sr2)
+ assert(not start_remove(fl)) -- empty
+
+ for i=1,SIZE do
+ local w, sw = start_add(fl)
+ assert(w)
+ finish(w, sw)
+ end
+ assert(not start_add(fl)) -- full
+ for i=1,SIZE do
+ local r, sr = start_remove(fl)
+ assert(r)
+ finish(r, sr)
+ end
+ assert(not start_remove(fl)) -- empty
+
+ local w = {}
+ for
_=1,10000 do + for _=1,math.random(SIZE) do + local w1, sw = start_add(fl) + if not w1 then break end + finish(w1, sw) + table.insert(w, w1) + end + for _=1,math.random(#w) do + local r, sr = start_remove(fl) + assert(r == table.remove(w, 1)) + finish(r, sr) + end + end +end \ No newline at end of file diff --git a/src/core/packet.lua b/src/core/packet.lua index ca71a3e547..f282357f91 100644 --- a/src/core/packet.lua +++ b/src/core/packet.lua @@ -12,10 +12,11 @@ local lib = require("core.lib") local memory = require("core.memory") local shm = require("core.shm") local counter = require("core.counter") -local sync = require("core.sync") require("core.packet_h") +local group_freelist = require("core.group_freelist") + local packet_t = ffi.typeof("struct packet") local packet_ptr_t = ffi.typeof("struct packet *") local packet_size = ffi.sizeof(packet_t) @@ -55,9 +56,8 @@ local max_packets = 1e6 ffi.cdef([[ struct freelist { - int32_t lock[1]; - uint64_t nfree; - uint64_t max; + int nfree; + int max; struct packet *list[]]..max_packets..[[]; }; ]]) @@ -98,14 +98,6 @@ local function freelist_nfree(freelist) return freelist.nfree end -local function freelist_lock(freelist) - sync.lock(freelist.lock) -end - -local function freelist_unlock(freelist) - sync.unlock(freelist.lock) -end - local packet_allocation_step = 1000 local packets_allocated = 0 -- Initialized on demand. @@ -119,19 +111,39 @@ end -- Call to ensure group freelist is enabled. function enable_group_freelist () if not group_fl then - group_fl = freelist_create("group/packets.freelist") + group_fl = group_freelist.freelist_create("group/packets.freelist") end end +-- Cache group_freelist.chunksize +local group_fl_chunksize = group_freelist.chunksize + -- Return borrowed packets to group freelist. -function rebalance_freelists () - if group_fl and freelist_nfree(packets_fl) > packets_allocated then - freelist_lock(group_fl) - while freelist_nfree(packets_fl) > packets_allocated - and not freelist_full(group_fl) do - freelist_add(group_fl, freelist_remove(packets_fl)) +function rebalance_step () + local chunk, seq = group_freelist.start_add(group_fl) + if chunk then + chunk.nfree = group_fl_chunksize + for i=0, chunk.nfree-1 do + chunk.list[i] = freelist_remove(packets_fl) + end + group_freelist.finish(chunk, seq) + else + error("group freelist overflow") + end +end + +function need_rebalance () + return freelist_nfree(packets_fl) >= (packets_allocated + group_fl_chunksize) +end + +-- Reclaim packets from group freelist. +function reclaim_step () + local chunk, seq = group_freelist.start_remove(group_fl) + if chunk then + for i=0, chunk.nfree-1 do + freelist_add(packets_fl, chunk.list[i]) end - freelist_unlock(group_fl) + group_freelist.finish(chunk, seq) end end @@ -142,19 +154,14 @@ shm.register( {open = function (name) return shm.open(name, "struct freelist") end} ) ffi.metatype("struct freelist", {__tostring = function (freelist) - return ("%d/%d"):format(tonumber(freelist.nfree), tonumber(freelist.max)) + return ("%d/%d"):format(freelist.nfree, freelist.max) end}) -- Return an empty packet. function allocate () if freelist_nfree(packets_fl) == 0 then if group_fl then - freelist_lock(group_fl) - while freelist_nfree(group_fl) > 0 - and freelist_nfree(packets_fl) < packets_allocated do - freelist_add(packets_fl, freelist_remove(group_fl)) - end - freelist_unlock(group_fl) + reclaim_step() end if freelist_nfree(packets_fl) == 0 then preallocate_step() @@ -169,15 +176,19 @@ end -- process termination. 
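-- (Drain sketch: shutdown() below moves the dead process's private
-- freelist into the group freelist one chunk at a time via
-- group_freelist.start_add()/finish(), so that surviving group members
-- can reclaim the packets with reclaim_step().)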
function shutdown (pid)
 local in_group, group_fl = pcall(
- freelist_open, "/"..pid.."/group/packets.freelist"
+ group_freelist.freelist_open, "/"..pid.."/group/packets.freelist"
 )
 if in_group then
 local packets_fl = freelist_open("/"..pid.."/engine/packets.freelist")
- freelist_lock(group_fl)
 while freelist_nfree(packets_fl) > 0 do
- freelist_add(group_fl, freelist_remove(packets_fl))
+ local chunk, seq = group_freelist.start_add(group_fl)
+ assert(chunk, "group freelist overflow")
+ chunk.nfree = math.min(group_fl_chunksize, freelist_nfree(packets_fl))
+ for i=0, chunk.nfree-1 do
+ chunk.list[i] = freelist_remove(packets_fl)
+ end
+ group_freelist.finish(chunk, seq)
 end
- freelist_unlock(group_fl)
 end
end
@@ -282,6 +293,9 @@ local free_internal, account_free =
 function free (p)
 account_free(p)
 free_internal(p)
+ if group_fl and need_rebalance() then
+ rebalance_step()
+ end
 end
-- Set packet data length.
@@ -293,7 +307,8 @@ function resize (p, len)
 end
 function preallocate_step()
- assert(packets_allocated + packet_allocation_step <= max_packets,
+ assert(packets_allocated + packet_allocation_step
+ <= max_packets - group_fl_chunksize,
 "packet allocation overflow")
 for i=1, packet_allocation_step do
diff --git a/src/lib/README.ctable.md b/src/lib/README.ctable.md
index 2e3d3e2fc6..fbfd8935fa 100644
--- a/src/lib/README.ctable.md
+++ b/src/lib/README.ctable.md
@@ -55,6 +55,10 @@ Optional entries that may be present in the *parameters* table include:
 * `resize_callback`: An optional function that is called after the table has been resized. The function is called with two arguments: the ctable object and the old size. By default, no callback is used.
+ * `max_displacement_limit`: An upper limit to the extra slots allocated
+ for displaced entries. By default we allocate `size*2` slots.
+ After a careful reading of *ctable.lua* you can set this to, say, 30,
+ and thereby reduce memory usage to `size+2*30` slots.
 — Function **ctable.load** *stream* *parameters*
diff --git a/src/lib/ctable.lua b/src/lib/ctable.lua
index fb575efdc5..de83e67c5b 100644
--- a/src/lib/ctable.lua
+++ b/src/lib/ctable.lua
@@ -128,7 +128,13 @@ local optional_params = {
 initial_size = 8,
 max_occupancy_rate = 0.9,
 min_occupancy_rate = 0.0,
- resize_callback = false
+ resize_callback = false,
+ -- The default value for max_displacement_limit is infinity.
+ -- This is safe but uses lots of memory. An alternative value that is
+ -- known to be reasonable, and virtually infinite in practice, is 30.
+ -- In practice, users of lib.ctable can set a lower
+ -- max_displacement_limit to limit memory usage. See CTable:resize().
+ max_displacement_limit = 1/0
}
function new(params)
@@ -147,9 +153,11 @@ function new(params)
 ctab.size = 0
 ctab.max_displacement = 0
 ctab.occupancy = 0
+ ctab.lookup_helpers = {}
 ctab.max_occupancy_rate = params.max_occupancy_rate
 ctab.min_occupancy_rate = params.min_occupancy_rate
 ctab.resize_callback = params.resize_callback
+ ctab.max_displacement_limit = params.max_displacement_limit
 ctab = setmetatable(ctab, { __index = CTable })
 ctab:reseed_hash_function(params.hash_seed)
 ctab:resize(params.initial_size)
@@ -220,16 +228,25 @@ function CTable:resize(size)
 local old_size = self.size
 local old_max_displacement = self.max_displacement
- -- Allocate double the requested number of entries to make sure there
- -- is sufficient displacement if all hashes map to the last bucket.
- self.entries, self.byte_size = calloc(self.entry_type, size * 2) + -- Theoretically, all hashes can map to the last bucket and + -- max_displacement could become as large as the table size. To be + -- safe, we should allocate twice as many entries as the size of + -- the table. In practice, max_displacement is expected to always + -- be a small number. We use max_displacement_limit as a cap for + -- this value that "should be enough for everyone". This is not + -- entirely safe, since an overrun can occur before the check for + -- the cap in maybe_increase_max_displacement(). The factor 2 here + -- reduces that risk but does not eliminate it. + local alloc_size = math.min(size*2, size + 2 * self.max_displacement_limit) + self.entries, self.byte_size = calloc(self.entry_type, alloc_size) self.size = size self.scale = self.size / HASH_MAX self.occupancy = 0 self.max_displacement = 0 + self.lookup_helper = self:make_lookup_helper() self.occupancy_hi = ceil(self.size * self.max_occupancy_rate) self.occupancy_lo = floor(self.size * self.min_occupancy_rate) - for i=0,self.size*2-1 do self.entries[i].hash = HASH_MAX end + for i=0,alloc_size-1 do self.entries[i].hash = HASH_MAX end if old_size ~= 0 then self:reseed_hash_function() end @@ -269,7 +286,7 @@ function load(stream, params) params_copy.max_occupancy_rate = header.max_occupancy_rate local ctab = new(params_copy) ctab.occupancy = header.occupancy - ctab.max_displacement = header.max_displacement + ctab:maybe_increase_max_displacement(header.max_displacement) local entry_count = ctab.size + ctab.max_displacement -- Slurp the entries directly into the ctable's backing store. @@ -289,6 +306,23 @@ function CTable:save(stream) self.size + self.max_displacement) end +function CTable:make_lookup_helper() + local entries_per_lookup = self.max_displacement + 1 + local search = self.lookup_helpers[entries_per_lookup] + if search == nil then + search = binary_search.gen(entries_per_lookup, self.entry_type) + self.lookup_helpers[entries_per_lookup] = search + end + return search +end + +function CTable:maybe_increase_max_displacement(displacement) + if displacement <= self.max_displacement then return end + assert(displacement <= self.max_displacement_limit) + self.max_displacement = displacement + self.lookup_helper = self:make_lookup_helper() +end + function CTable:add(key, value, updates_allowed) if self.occupancy + 1 > self.occupancy_hi then -- Note that resizing will invalidate all hash keys, so we need @@ -333,7 +367,7 @@ function CTable:add(key, value, updates_allowed) assert(updates_allowed ~= 'required', "key not found in ctable") - self.max_displacement = max(self.max_displacement, index - start_index) + self:maybe_increase_max_displacement(index - start_index) if entries[index].hash ~= HASH_MAX then -- In a robin hood hash, we seek to spread the wealth around among @@ -349,7 +383,7 @@ function CTable:add(key, value, updates_allowed) while empty > index do entries[empty] = entries[empty - 1] local displacement = empty - hash_to_index(entries[empty].hash, scale) - self.max_displacement = max(self.max_displacement, displacement) + self:maybe_increase_max_displacement(displacement) empty = empty - 1; end end @@ -369,22 +403,24 @@ end function CTable:lookup_ptr(key) local hash = self.hash_fn(key) local entry = self.entries + hash_to_index(hash, self.scale) + entry = self.lookup_helper(entry, hash) - -- Fast path in case we find it directly. 
- if hash == entry.hash and self.equal_fn(key, entry.key) then - return entry - end - - while entry.hash < hash do entry = entry + 1 end - - while entry.hash == hash do + if hash == entry.hash then + -- Peel the first iteration of the loop; collisions will be rare. if self.equal_fn(key, entry.key) then return entry end - -- Otherwise possibly a collision. entry = entry + 1 + if entry.hash ~= hash then return nil end + while entry.hash == hash do + if self.equal_fn(key, entry.key) then return entry end + -- Otherwise possibly a collision. + entry = entry + 1 + end + -- Not found. + return nil + else + -- Not found. + return nil end - - -- Not found. - return nil end function CTable:lookup_and_copy(key, entry) diff --git a/src/lib/hardware/pci.lua b/src/lib/hardware/pci.lua index ab1714be03..f45b182ff8 100644 --- a/src/lib/hardware/pci.lua +++ b/src/lib/hardware/pci.lua @@ -186,7 +186,19 @@ end function map_pci_memory (f) local st = assert(f:stat()) - local mem = assert(f:mmap(nil, st.size, "read, write", "shared", 0)) + local mem, err = f:mmap(nil, st.size, "read, write", "shared", 0) + -- mmap() returns EINVAL on Linux >= 4.5 if the device is still + -- claimed by the kernel driver. We assume that + -- unbind_device_from_linux() has already been called but it may take + -- some time for the driver to release the device. + if not mem and err.INVAL then + lib.waitfor2("mmap of "..filepath, + function () + mem, err = f:mmap(nil, st.size, "read, write", "shared", 0) + return mem ~= nil or not err.INVAL + end, 5, 1000000) + end + assert(mem, err) return ffi.cast("uint32_t *", mem) end diff --git a/src/lib/ipc/shmem/iftable_mib.lua b/src/lib/ipc/shmem/iftable_mib.lua index 08ac38c0dc..7e62959b90 100644 --- a/src/lib/ipc/shmem/iftable_mib.lua +++ b/src/lib/ipc/shmem/iftable_mib.lua @@ -17,7 +17,7 @@ local iftypes = { [0x1003] = 136, -- l3ipvlan } -function init_snmp (objs, name, counters, directory, interval) +function init_snmp (objs, name, counters, directory, interval, log_date) -- Rudimentary population of a row in the ifTable MIB. Allocation -- of the ifIndex is delegated to the SNMP agent via the name of -- the interface in ifDescr. @@ -30,7 +30,8 @@ function init_snmp (objs, name, counters, directory, interval) end local ifTable = mib:new({ directory = directory or nil, filename = name }) - local logger = logger.new({ module = 'iftable_mib' }) + local logger = logger.new({ date = log_date, + module = 'iftable_mib' }) -- ifTable ifTable:register('ifDescr', 'OctetStr', objs.ifDescr) ifTable:register('ifType', 'Integer32') diff --git a/src/lib/numa.lua b/src/lib/numa.lua index f3ab02418a..67b554c80e 100644 --- a/src/lib/numa.lua +++ b/src/lib/numa.lua @@ -216,7 +216,7 @@ function bind_to_cpu (cpu, skip_perf_checks) end function unbind_numa_node () - if supports_numa() then + if has_numa() then assert(S.set_mempolicy('default')) end bound_numa_node = nil @@ -227,7 +227,7 @@ function bind_to_numa_node (node, policy) if not node then return unbind_numa_node() end assert(not bound_numa_node, "already bound") - if supports_numa() then + if has_numa() then assert(S.set_mempolicy(policy or 'preferred', node)) -- Migrate any pages that might have the wrong affinity. 
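A minimal sketch of the `max_displacement_limit` parameter documented in
README.ctable.md above (the key/value types and the limit of 30 are
illustrative, not prescriptive):

```lua
local ctable = require("lib.ctable")
local ffi = require("ffi")

-- Cap the slack for displaced entries: the backing store becomes
-- size + 2*30 slots instead of the default size*2.
local t = ctable.new{
   key_type = ffi.typeof("uint32_t"),
   value_type = ffi.typeof("uint32_t"),
   max_displacement_limit = 30
}
```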
diff --git a/src/lib/poptrie.lua b/src/lib/poptrie.lua index 972468430e..2367eca46e 100644 --- a/src/lib/poptrie.lua +++ b/src/lib/poptrie.lua @@ -37,13 +37,19 @@ end function new (init) local self = setmetatable({}, {__index=Poptrie}) + if init.leaf_t ~= nil then + self.leaf_t = init.leaf_t + assert(self.leaf_t == ffi.typeof("uint16_t") or + self.leaf_t == ffi.typeof("uint32_t"), + "Unsupported leaf type: "..tostring(self.leaf_t)) + end if init.leaves and init.nodes then self.leaves, self.num_leaves = init.leaves, assert(init.num_leaves) self.nodes, self.num_nodes = init.nodes, assert(init.num_nodes) elseif init.nodes or init.leaves or init.directmap then error("partial init") else - self.leaves = array(Poptrie.leaf_t, Poptrie.num_leaves) + self.leaves = array(self.leaf_t, Poptrie.num_leaves) self.nodes = array(Poptrie.node_t, Poptrie.num_nodes) end if init.directmap then @@ -67,8 +73,8 @@ end local asm_cache = {} function Poptrie:configure_lookup () - local config = ("leaf_compression=%s,direct_pointing=%s,s=%s") - :format(self.leaf_compression, self.direct_pointing, self.s) + local config = ("leaf_compression=%s,direct_pointing=%s,s=%s,leaf_t=%s") + :format(self.leaf_compression, self.direct_pointing, self.s, self.leaf_t) if not asm_cache[config] then asm_cache[config] = { poptrie_lookup.generate(self, 32), @@ -89,7 +95,7 @@ end function Poptrie:grow_leaves () self.num_leaves = self.num_leaves * 2 - local new_leaves = array(Poptrie.leaf_t, self.num_leaves) + local new_leaves = array(self.leaf_t, self.num_leaves) ffi.copy(new_leaves, self.leaves, ffi.sizeof(self.leaves)) self.leaves = new_leaves end @@ -439,6 +445,19 @@ function selftest () assert(t:lookup128(s(0x3F)) == 5) assert(t:lookup128(s(0xFF)) == 4) assert(t:lookup128(s(0xF0,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x0F)) == 6) + -- Test 32-bit leaves + local t = new{direct_pointing=true, s=8, leaf_t=ffi.typeof("uint32_t")} + t:add(s(0xff,0x00), 9, 0xffffffff) + t:add(s(0xff,0x01), 9, 0xfffffffe) + t:build() + assert(t:lookup(s(0xff,0x00)) == 0xffffffff) + assert(t:lookup(s(0xff,0x01)) == 0xfffffffe) + assert(t:lookup32(s(0xff,0x00)) == 0xffffffff) + assert(t:lookup32(s(0xff,0x01)) == 0xfffffffe) + assert(t:lookup64(s(0xff,0x00)) == 0xffffffff) + assert(t:lookup64(s(0xff,0x01)) == 0xfffffffe) + assert(t:lookup128(s(0xff,0x00)) == 0xffffffff) + assert(t:lookup128(s(0xff,0x01)) == 0xfffffffe) -- Random testing local function reproduce (cases, config) diff --git a/src/lib/poptrie_lookup.dasl b/src/lib/poptrie_lookup.dasl index a91216024b..6cc1d529ec 100644 --- a/src/lib/poptrie_lookup.dasl +++ b/src/lib/poptrie_lookup.dasl @@ -19,7 +19,7 @@ local anchor = {} -- (leaf_t *leaves, node_t *nodes, uint8_t *key, base_t *directmap) -- NB: this type is hardcoded here to avoid filling up the ctype table local prototype = ffi.typeof( - "uint16_t (*) (void *, void *, uint8_t *, void *)" + "uint32_t (*) (void *, void *, uint8_t *, void *)" ) -- Assemble a lookup routine @@ -30,7 +30,6 @@ function generate (Poptrie, keysize) assert(Poptrie.s <= 32) assert(Poptrie.leaf_tag == bit.lshift(1, 31)) end - assert(ffi.sizeof(Poptrie.leaf_t) == 2) assert(ffi.sizeof(Poptrie.vector_t) == 8) assert(ffi.sizeof(Poptrie.base_t) == 4) assert(ffi.offsetof(Poptrie.node_t, 'leafvec') == 0) @@ -177,6 +176,10 @@ function lookup (Dst, Poptrie, keysize) -- return leaves[base + bc - 1] | mov index, dword [node+16] -- nodes[index].base0 | add index, eax -- index = base + bc - | movzx eax, word [leaves+index*2-2] -- leaves[index - 1] + if ffi.sizeof(Poptrie.leaf_t) == 2 then + 
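-- A 16-bit leaf must be zero-extended (movzx) into the 32-bit return
-- register; a 4-byte leaf_t is loaded with a plain dword mov below.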
| movzx eax, word [leaves+index*2-2] -- leaves[index - 1]
+ elseif ffi.sizeof(Poptrie.leaf_t) == 4 then
+ | mov eax, dword [leaves+index*4-4] -- leaves[index - 1]
+ else error("NYI") end
 | ret
end
diff --git a/src/lib/protocol/README.md b/src/lib/protocol/README.md
index e2e18e6dfc..64564c7078 100644
--- a/src/lib/protocol/README.md
+++ b/src/lib/protocol/README.md
@@ -45,8 +45,9 @@ IPv4 or IPv6 header class.
 ### Ethernet (lib.protocol.ethernet)
 The `lib.protocol.ethernet` module contains a class for representing
-*Ethernet headers*. The `ethernet` protocol class supports two upper
-layer protocols: `lib.protocol.ipv4` and `lib.protocol.ipv6`.
+*Ethernet headers*. The `ethernet` protocol class supports three upper
+layer protocols: `lib.protocol.ipv4`, `lib.protocol.ipv6`,
+and `lib.protocol.dot1q`.
 — Method **ethernet:new** *config*
@@ -112,6 +113,35 @@ Returns a true value if *mac* address denotes a [Broadcast address](https://en.w
 Returns the MAC address for IPv6 multicast *ip* as defined by RFC2464, section 7.
+### IEEE 802.1Q VLAN (lib.protocol.dot1q)
+
+The `lib.protocol.dot1q` module contains a class for representing
+[IEEE 802.1Q](https://en.wikipedia.org/wiki/IEEE_802.1Q) VLAN headers.
+The `dot1q` protocol class supports two upper layer protocols:
+`lib.protocol.ipv4` and `lib.protocol.ipv6`.
+
+— Method **dot1q:new** *config*
+
+Returns a new VLAN header for *config*. *Config* must be a table
+which may contain the following keys:
+
+* `id` - VLAN id (PCP/DEI/VID) encoded in host byte order. Default is 0.
+* `type` - Either `0x0800` or `0x86dd` for IPv4 or IPv6, respectively.
+ Default is `0x0`.
+
+— Method **dot1q:id** *id*
+
+— Method **dot1q:type** *type*
+
+Combined accessor and setter methods. These methods set the values of the
+id and type fields of the VLAN header. If no argument
+is given, the current value is returned.
+
+— Constant **dot1q.TPID**
+
+The value `0x8100`. Used as the type in `lib.protocol.ethernet` to
+indicate that an IEEE 802.1Q VLAN header follows.
+
 ### IPv4 (lib.protocol.ipv4)
@@ -205,6 +235,12 @@ Returns the binary representation of IPv4 address denoted by *string*.
 Returns the string representation of *ip* address.
+— Function **ipv4:pton_cidr** *string*
+
+Returns the binary representation of the IPv4 address prefix and the prefix
+length denoted by *string* of the form `<address>/<length>`.
+See [CIDR notation](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing).
+
 ### IPv6 (lib.protocol.ipv6)
@@ -289,6 +325,12 @@ Returns the binary representation of IPv6 address denoted by *string*.
 Returns the string representation of *ip* address.
+— Function **ipv6:pton_cidr** *string*
+
+Returns the binary representation of the IPv6 address prefix and the prefix
+length denoted by *string* of the form `<address>/<length>`.
+See [CIDR notation](https://en.wikipedia.org/wiki/Classless_Inter-Domain_Routing).
+
 — Function **ipv6:solicited_node_mcast** *ip*
 Returns the solicited-node multicast address from the given unicast
diff --git a/src/lib/protocol/dot1q.lua b/src/lib/protocol/dot1q.lua
new file mode 100644
index 0000000000..c85403f95e
--- /dev/null
+++ b/src/lib/protocol/dot1q.lua
@@ -0,0 +1,60 @@
+-- Use of this source code is governed by the Apache 2.0 license; see COPYING.
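+--
+-- Usage sketch (illustrative only; see the accessors defined below):
+--
+--   local dot1q = require("lib.protocol.dot1q")
+--   local tag = dot1q:new({id = 42, type = 0x0800}) -- VID 42, IPv4 payload
+--   assert(tag:id() == 42 and tag:type() == 0x0800)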
+ +module(..., package.seeall) +local ffi = require("ffi") +local lib = require("core.lib") +local header = require("lib.protocol.header") +local ntohs, htons = lib.ntohs, lib.htons + +local dot1q = subClass(header) + +-- Class variables +dot1q._name = "dot1q" +dot1q._ulp = { + class_map = { + [0x0800] = "lib.protocol.ipv4", + [0x86dd] = "lib.protocol.ipv6", + }, + method = 'type' } +dot1q:init( + { + [1] = ffi.typeof[[ + struct { + uint16_t pcp_dei_vid; + uint16_t ether_type; + } __attribute__((packed)) + ]] + }) + +dot1q.TPID = 0x8100 + +-- Class methods + +function dot1q:new (config) + local o = dot1q:superClass().new(self) + o:id(config.id) + o:type(config.type) + return o +end + +-- Instance methods + +function dot1q:id (id) + local h = self:header() + if id ~= nil then + h.pcp_dei_vid = htons(id) + else + return(ntohs(h.pcp_dei_vid)) + end +end + +function dot1q:type (t) + local h = self:header() + if t ~= nil then + h.ether_type = htons(t) + else + return(ntohs(h.ether_type)) + end +end + +return dot1q diff --git a/src/lib/protocol/ethernet.lua b/src/lib/protocol/ethernet.lua index c874f35acb..9e937698b2 100644 --- a/src/lib/protocol/ethernet.lua +++ b/src/lib/protocol/ethernet.lua @@ -18,6 +18,7 @@ ethernet._ulp = { class_map = { [0x0800] = "lib.protocol.ipv4", [0x86dd] = "lib.protocol.ipv6", + [0x8100] = "lib.protocol.dot1q" }, method = 'type' } ethernet:init( diff --git a/src/lib/protocol/ipv4.lua b/src/lib/protocol/ipv4.lua index 4ecd82a9bf..b89c025097 100644 --- a/src/lib/protocol/ipv4.lua +++ b/src/lib/protocol/ipv4.lua @@ -59,6 +59,9 @@ ipv4:init( function ipv4:new (config) local o = ipv4:superClass().new(self) + if not o._recycled then + o._ph = ipv4hdr_pseudo_t() + end o:header().ihl_v_tos = htons(0x4000) -- v4 o:ihl(o:sizeof() / 4) o:dscp(config.dscp or 0) @@ -75,6 +78,14 @@ function ipv4:new (config) return o end +function ipv4:new_from_mem(mem, size) + local o = ipv4:superClass().new_from_mem(self, mem, size) + if not o._recycled then + o._ph = ipv4hdr_pseudo_t() + end + return o +end + function ipv4:pton (p) local in_addr = ffi.new("uint8_t[4]") local result = C.inet_pton(AF_INET, p, in_addr) @@ -90,16 +101,15 @@ function ipv4:ntop (n) return ffi.string(c_str) end -function ipv4:set(addr) - return ipv4:pton(addr) +function ipv4:pton_cidr (p) + local prefix, length = p:match("([^/]*)/([0-9]*)") + return + ipv4:pton(prefix), + assert(tonumber(length), "Invalid length "..length) end -- Instance methods -function ipv4:get() - return ipv4:ntop(self) -end - function ipv4:version (v) return lib.bitfield(16, self:header(), 'ihl_v_tos', 0, 4, v) end @@ -202,7 +212,8 @@ end -- protocol. They differ from the respective values of the ipv6 -- header if extension headers are present. 
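-- (Note: self._ph is allocated once per instance in new()/new_from_mem()
-- above and cleared with ffi.fill() on each call, so no fresh
-- pseudo-header cdata is allocated per packet.)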
function ipv4:pseudo_header (ulplen, proto) - local ph = ipv4hdr_pseudo_t() + local ph = self._ph + ffi.fill(ph, ffi.sizeof(ph)) local h = self:header() ffi.copy(ph, h.src_ip, 2*ipv4_addr_t_size) -- Copy source and destination ph.ulp_length = htons(ulplen) diff --git a/src/lib/protocol/ipv6.lua b/src/lib/protocol/ipv6.lua index f1edcdde09..c0ec518a51 100644 --- a/src/lib/protocol/ipv6.lua +++ b/src/lib/protocol/ipv6.lua @@ -99,12 +99,11 @@ function ipv6:ntop (n) return ffi.string(c_str) end -function ipv6:get() - return self:ntop(self) -end - -function ipv6:set(addr) - self:pton(addr) +function ipv6:pton_cidr (p) + local prefix, length = p:match("([^/]*)/([0-9]*)") + return + ipv6:pton(prefix), + assert(tonumber(length), "Invalid length "..length) end -- Construct the solicited-node multicast address from the given diff --git a/src/lib/ptree/worker.lua b/src/lib/ptree/worker.lua index f188807802..0717e15044 100644 --- a/src/lib/ptree/worker.lua +++ b/src/lib/ptree/worker.lua @@ -105,6 +105,8 @@ function Worker:main () if not engine.auditlog_enabled then engine.enable_auditlog() end + engine.enable_tick() + engine.setvmprofile("engine") repeat self.breathe() diff --git a/src/lib/timers/ingress_drop_monitor.lua b/src/lib/timers/ingress_drop_monitor.lua index b8b82e6e53..3fe54a2d26 100644 --- a/src/lib/timers/ingress_drop_monitor.lua +++ b/src/lib/timers/ingress_drop_monitor.lua @@ -54,12 +54,10 @@ function new(args) end function IngressDropMonitor:sample () - local app_array = engine.breathe_push_order local sum = self.current_value sum[0] = 0 - for i = 1, #app_array do - local app = app_array[i] - if app.get_rxstats and not app.dead then + for _, app in pairs(engine.app_table) do + if app.get_rxstats then sum[0] = sum[0] + app:get_rxstats().dropped end end diff --git a/src/program/ipfix/README b/src/program/ipfix/README index 421449041e..345e495b93 100644 --- a/src/program/ipfix/README +++ b/src/program/ipfix/README @@ -1,5 +1,5 @@ Usage: - snabb ipfix probe + snabb ipfix probe | probe_rss | stats Use --help for per-command usage. 
Example: diff --git a/src/program/ipfix/lib.lua b/src/program/ipfix/lib.lua new file mode 100644 index 0000000000..019e2f233f --- /dev/null +++ b/src/program/ipfix/lib.lua @@ -0,0 +1,447 @@ +module(..., package.seeall) + +local now = require("core.app").now +local lib = require("core.lib") +local shm = require("core.shm") +local counter = require("core.counter") +local app_graph = require("core.config") +local link = require("core.link") +local pci = require("lib.hardware.pci") +local numa = require("lib.numa") +local ipv4 = require("lib.protocol.ipv4") +local ethernet = require("lib.protocol.ethernet") +local macaddress = require("lib.macaddress") +local S = require("syscall") +local basic = require("apps.basic.basic_apps") +local arp = require("apps.ipv4.arp") +local ipfix = require("apps.ipfix.ipfix") +local template = require("apps.ipfix.template") +local rss = require("apps.rss.rss") +local ifmib = require("lib.ipc.shmem.iftable_mib") +local Transmitter = require("apps.interlink.transmitter") + + +local ifmib_dir = '/ifmib' + +-- apps that can be used as an input or output for the exporter +local in_apps, out_apps = {}, {} + +local function parse_spec (spec, delimiter) + local t = {} + for s in spec:split(delimiter or ':') do + table.insert(t, s) + end + return t +end + +function in_apps.pcap (path) + return { input = "input", + output = "output" }, + { require("apps.pcap.pcap").PcapReader, path } +end + +function out_apps.pcap (path) + return { input = "input", + output = "output" }, + { require("apps.pcap.pcap").PcapWriter, path } +end + +function out_apps.tap_routed (device) + return { input = "input", + output = "output" }, + { require("apps.tap.tap").Tap, { name = device } } +end + +function in_apps.raw (device) + return { input = "rx", + output = "tx" }, + { require("apps.socket.raw").RawSocket, device } +end +out_apps.raw = in_apps.raw + +function in_apps.tap (device) + return { input = "input", + output = "output" }, + { require("apps.tap.tap").Tap, device } +end +out_apps.tap = in_apps.tap + +function in_apps.interlink (name) + return { input = nil, + output = "output" }, + { require("apps.interlink.receiver"), nil } +end + +function in_apps.pci (spec) + local device, rxq = unpack(parse_spec(spec, '/')) + local device_info = pci.device_info(device) + local conf = { pciaddr = device } + if device_info.driver == 'apps.intel_mp.intel_mp' then + local rxq = (rxq and tonumber(rxq)) or 0 + conf.rxq = rxq + conf.rxcounter = rxq + conf.ring_buffer_size = 32768 + elseif device_info.driver == 'apps.mellanox.connectx' then + conf = { + pciaddress = device, + queue = rxq + } + end + return { input = device_info.rx, output = device_info.tx }, + { require(device_info.driver).driver, conf } +end +out_apps.pci = in_apps.pci + +function create_ifmib(stats, ifname, ifalias, log_date) + -- stats can be nil in case this process is not the master + -- of the device + if not stats then return end + if not shm.exists(ifmib_dir) then + shm.mkdir(ifmib_dir) + end + ifmib.init_snmp( { ifDescr = ifname, + ifName = ifname, + ifAlias = ifalias or "NetFlow input", }, + ifname:gsub('/', '-'), stats, + shm.root..ifmib_dir, 5, log_date) +end + +function value_to_string (value, string) + string = string or '' + local type = type(value) + if type == 'table' then + string = string.."{ " + if #value == 0 then + for key, value in pairs(value) do + string = string..key.." 
= " + string = value_to_string(value, string)..", " + end + else + for _, value in ipairs(value) do + string = value_to_string(value, string)..", " + end + end + string = string.." }" + elseif type == 'string' then + string = string..("%q"):format(value) + else + string = string..("%s"):format(value) + end + return string +end + +probe_config = { + -- Probe-specific + output_type = {required = true}, + output = { required = true }, + input_type = { default = nil }, + input = { default = nil }, + exporter_mac = { default = nil }, + -- Passed on to IPFIX app + active_timeout = { default = nil }, + idle_timeout = { default = nil }, + flush_timeout = { default = nil }, + cache_size = { default = nil }, + max_load_factor = { default = nil }, + scan_time = { default = nil }, + observation_domain = { default = nil }, + template_refresh_interval = { default = nil }, + ipfix_version = { default = nil }, + exporter_ip = { required = true }, + collector_ip = { required = true }, + collector_port = { required = true }, + mtu = { default = nil }, + templates = { required = true }, + maps = { default = {} }, + maps_logfile = { default = nil }, + instance = { default = 1 }, + add_packet_metadata = { default = true }, + log_date = { default = false }, + scan_protection = { default = {} } +} + +function configure_graph (arg, in_graph) + local config = lib.parse(arg, probe_config) + + local in_link, in_app + if config.input_type then + assert(in_apps[config.input_type], + "unknown input type: "..config.input_type) + assert(config.input, "Missing input parameter") + in_link, in_app = in_apps[config.input_type](config.input) + end + assert(out_apps[config.output_type], + "unknown output type: "..config.output_type) + local out_link, out_app = out_apps[config.output_type](config.output) + + if config.output_type == "tap_routed" then + local tap_config = out_app[2] + tap_config.mtu = config.mtu + end + + local function mk_ipfix_config() + return { active_timeout = config.active_timeout, + idle_timeout = config.idle_timeout, + flush_timeout = config.flush_timeout, + cache_size = config.cache_size, + max_load_factor = config.max_load_factor, + scan_time = config.scan_time, + observation_domain = config.observation_domain, + template_refresh_interval = + config.template_refresh_interval, + ipfix_version = config.ipfix_version, + exporter_ip = config.exporter_ip, + collector_ip = config.collector_ip, + collector_port = config.collector_port, + mtu = config.mtu - 14, + templates = config.templates, + maps = config.maps, + maps_log_fh = config.maps_logfile and + assert(io.open(config.maps_logfile, "a")) or nil, + instance = config.instance, + add_packet_metadata = config.add_packet_metadata, + log_date = config.log_date, + scan_protection = config.scan_protection } + end + + local ipfix_config = mk_ipfix_config() + local ipfix_name = "ipfix"..config.instance + local out_name = "out"..config.instance + local sink_name = "sink"..config.instance + + local graph = in_graph or app_graph.new() + if config.input then + local in_name = "in" + if config.input_type == "interlink" then + in_name = config.input + end + app_graph.app(graph, in_name, unpack(in_app)) + app_graph.link(graph, in_name ..".".. in_link.output .. 
" -> " + ..ipfix_name..".input") + end + app_graph.app(graph, ipfix_name, ipfix.IPFIX, ipfix_config) + app_graph.app(graph, out_name, unpack(out_app)) + + -- use ARP for link-layer concerns unless the output is connected + -- to a pcap writer or a routed tap interface + if (config.output_type ~= "pcap" and + config.output_type ~= "tap_routed") then + local arp_name = "arp"..config.instance + local arp_config = { self_mac = config.exporter_mac and + ethernet:pton(config.exporter_mac), + self_ip = ipv4:pton(config.exporter_ip), + next_ip = ipv4:pton(config.collector_ip) } + app_graph.app(graph, arp_name, arp.ARP, arp_config) + app_graph.app(graph, sink_name, basic.Sink) + + app_graph.link(graph, out_name.."."..out_link.output.." -> " + ..arp_name..".south") + + -- with UDP, ipfix doesn't need to handle packets from the collector + app_graph.link(graph, arp_name..".north -> "..sink_name..".input") + + app_graph.link(graph, ipfix_name..".output -> "..arp_name..".north") + app_graph.link(graph, arp_name..".south -> " + ..out_name.."."..out_link.input) + else + app_graph.link(graph, ipfix_name..".output -> " + ..out_name.."."..out_link.input) + app_graph.app(graph, sink_name, basic.Sink) + app_graph.link(graph, out_name.."."..out_link.output.." -> " + ..sink_name..".input") + end + + engine.configure(graph) + + if config.input_type and config.input_type == "pci" then + local pciaddr = unpack(parse_spec(config.input, '/')) + create_ifmib(engine.app_table['in'].stats, (pciaddr:gsub("[:%.]", "_")), + config.log_date) + end + if config.output_type == "tap_routed" then + create_ifmib(engine.app_table[out_name].shm, config.output, + "IPFIX Observation Domain "..config.observation_domain, + config.log_date) + end + + if config.output_type == "tap_routed" then + local tap_config = out_app[2] + local name = tap_config.name + local tap_sysctl_base = "net/ipv4/conf/"..name + assert(S.sysctl(tap_sysctl_base.."/rp_filter", '0')) + assert(S.sysctl(tap_sysctl_base.."/accept_local", '1')) + assert(S.sysctl(tap_sysctl_base.."/forwarding", '1')) + local out_stats = engine.app_table[out_name].shm + local ipfix_config = mk_ipfix_config() + ipfix_config.exporter_eth_dst = + tostring(macaddress:new(counter.read(out_stats.macaddr))) + app_graph.app(graph, ipfix_name, ipfix.IPFIX, ipfix_config) + engine.configure(graph) + end + + return config, graph +end + +function parse_jit_option_fn (jit) + return function (arg) + if arg:match("^v") then + local file = arg:match("^v=(.*)") + if file == '' then file = nil end + jit.v = file + elseif arg:match("^p") then + local opts, file = arg:match("^p=([^,]*),?(.*)") + if file == '' then file = nil end + jit.p = { opts, file } + elseif arg:match("^dump") then + local opts, file = arg:match("^dump=([^,]*),?(.*)") + if file == '' then file = nil end + jit.dump = { opts, file } + elseif arg:match("^opt") then + local opt = arg:match("^opt=(.*)") + table.insert(jit.opts, opt) + elseif arg:match("^tprof") then + jit.traceprof = true + end + end +end + +local function set_jit_options (jit) + if not jit then return end + if jit.v then + require("jit.v").start(jit.v) + end + if jit.p and #jit.p > 0 then + require("jit.p").start(unpack(jit.p)) + end + if jit.traceprof then + require("lib.traceprof.traceprof").start() + end + if jit.dump and #jit.dump > 0 then + require("jit.dump").on(unpack(jit.dump)) + end + if jit.opts and #jit.opts > 0 then + require("jit.opt").start(unpack(jit.opts)) + end +end + +local function clear_jit_options (jit) + if not jit then return end + if jit.dump then + 
require("jit.dump").off() + end + if jit.traceprof then + require("lib.traceprof.traceprof").stop() + end + if jit.p then + require("jit.p").stop() + end + if jit.v then + require("jit.v").stop() + end +end + +-- Run an instance of the ipfix probe +function run (arg, duration, busywait, cpu, jit) + local config = configure_graph(arg) + + if cpu then numa.bind_to_cpu(cpu) end + set_jit_options(jit) + + local done + if not duration and config.input_type == "pcap" then + done = function () + return engine.app_table['in'].done + end + end + + local t1 = now() + + if busywait ~= nil then + engine.busywait = busywait + end + engine.main({ duration = duration, done = done, measure_latency = false }) + + clear_jit_options(jit) + + local t2 = now() + local stats = link.stats(engine.app_table['ipfix'..config.instance].input.input) + print("IPFIX probe stats:") + local comma = lib.comma_value + print(string.format("bytes: %s packets: %s bps: %s Mpps: %s", + comma(stats.rxbytes), + comma(stats.rxpackets), + comma(math.floor((stats.rxbytes * 8) / (t2 - t1))), + comma(stats.rxpackets / ((t2 - t1) * 1000000)))) + +end + +-- Run an instance of the RSS app. The output links can either be +-- interlinks or regular links with an instance of an ipfix probe +-- attached. +function run_rss(config, inputs, outputs, duration, busywait, cpu, jit, log_date) + if cpu then numa.bind_to_cpu(cpu) end + set_jit_options(jit) + + local graph = app_graph.new() + app_graph.app(graph, "rss", rss.rss, config) + + -- An input describes a physical interface + local tags, in_app_specs = {}, {} + for n, input in ipairs(inputs) do + local suffix = #inputs > 1 and n or '' + local input_name = "input"..suffix + local in_link, in_app = in_apps.pci(input.device.."/"..input.rxq) + input.config.pciaddr = input.device + table.insert(in_app_specs, + { pciaddr = input.device, + name = input_name, + ifname = input.name or + (input.device:gsub("[:%.]", "_")), + ifalias = input.description }) + app_graph.app(graph, input_name, unpack(in_app)) + local link_name = "input"..suffix + if input.tag then + local tag = input.tag + assert(not(tags[tag]), "Tag not unique: "..tag) + link_name = "vlan"..tag + end + app_graph.link(graph, input_name.."."..in_link.output + .." -> rss."..link_name) + end + + -- An output describes either an interlink or a complete ipfix app + for _, output in ipairs(outputs) do + if output.type == 'interlink' then + -- Keys + -- link_name name of the link + app_graph.app(graph, output.link_name, Transmitter) + app_graph.link(graph, "rss."..output.link_name.." -> " + ..output.link_name..".input") + else + -- Keys + -- link_name name of the link + -- args probe configuration + -- instance # of embedded instance + output.args.instance = output.instance + local config = configure_graph(output.args, graph) + app_graph.link(graph, "rss."..output.link_name + .." 
-> ipfix"..output.instance..".input") + end + end + + engine.configure(graph) + for _, spec in ipairs(in_app_specs) do + create_ifmib(engine.app_table[spec.name].stats, + spec.ifname, spec.ifalias, log_date) + end + require("jit").flush() + + local engine_opts = { no_report = true, measure_latency = false } + if duration ~= 0 then engine_opts.duration = duration end + if busywait ~= nil then + engine.busywait = busywait + end + engine.main(engine_opts) + + clear_jit_options(jit) +end diff --git a/src/program/ipfix/probe/README b/src/program/ipfix/probe/README index 0570d74a89..7eec821dba 100644 --- a/src/program/ipfix/probe/README +++ b/src/program/ipfix/probe/README @@ -1,4 +1,4 @@ -Usage: snabb ipfix probe [options] +Usage: snabb ipfix probe [options] [