diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..588ee6f
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,108 @@
+# Copyright (c) 2018. TIG developer.
+
+ifdef V
+Q =
+else
+Q = @
+endif
+
+ifeq ($(machine),)
+machine = native
+endif
+
+RTE_SDK = $(CURDIR)/dpdk
+export RTE_SDK
+
+RTE_TARGET ?= x86_64-native-linuxapp-gcc
+export RTE_TARGET
+
+ifeq ($(bindir),)
+bindir = /usr/local/jupiter/bin
+endif
+
+ifeq ($(tooldir),)
+tooldir = /usr/local/jupiter/tool
+endif
+
+ifeq ($(kmoddir),)
+kmoddir = /usr/local/jupiter/kmod
+endif
+
+ifeq ($(confdir),)
+confdir = /usr/local/jupiter
+endif
+
+VERSION ?= 0.1
+
+.PHONY: all
+all: dpdk jupiter
+
+.PHONY: dpdk
+dpdk:
+	$(Q)cd $(RTE_SDK) && $(MAKE) O=$(RTE_TARGET) T=$(RTE_TARGET) config
+	$(Q)cd $(RTE_SDK) && sed -ri 's,(RTE_MACHINE=).*,\1$(machine),' $(RTE_TARGET)/.config
+	$(Q)cd $(RTE_SDK) && sed -ri 's,(RTE_APP_TEST=).*,\1n,' $(RTE_TARGET)/.config
+	$(Q)cd $(RTE_SDK) && sed -ri 's,(RTE_LIBRTE_PMD_PCAP=).*,\1y,' $(RTE_TARGET)/.config
+	$(Q)cd $(RTE_SDK) && sed -ri 's,(RTE_KNI_KMOD_ETHTOOL=).*,\1n,' $(RTE_TARGET)/.config
+	$(Q)cd $(RTE_SDK) && $(MAKE) O=$(RTE_TARGET)
+
+.PHONY: jupiter
+jupiter:
+	$(Q)cd lib && $(MAKE) O=$(RTE_TARGET)
+	$(Q)cd cmd && $(MAKE) O=$(RTE_TARGET)
+	$(Q)cd core && $(MAKE) O=$(RTE_TARGET)
+
+.PHONY: install
+install:
+	@echo ================== Installing $(DESTDIR)/
+	$(Q)test -d $(DESTDIR)/$(bindir) || mkdir -p $(DESTDIR)/$(bindir)
+	$(Q)cp -a cmd/$(RTE_TARGET)/jupiter-ctl $(DESTDIR)/$(bindir)
+	$(Q)cp -a core/$(RTE_TARGET)/jupiter-service $(DESTDIR)/$(bindir)
+	$(Q)cp -a $(RTE_SDK)/$(RTE_TARGET)/app/dpdk-pdump $(DESTDIR)/$(bindir)/jupiter-pdump
+
+	$(Q)test -d $(DESTDIR)/$(tooldir) || mkdir -p $(DESTDIR)/$(tooldir)
+	$(Q)cp -a $(RTE_SDK)/usertools/cpu_layout.py $(DESTDIR)/$(tooldir)/cpu_layout.py
+	$(Q)cp -a $(RTE_SDK)/usertools/dpdk-devbind.py $(DESTDIR)/$(tooldir)/dpdk-devbind.py
+
+	$(Q)test -d $(DESTDIR)/$(kmoddir) || mkdir -p $(DESTDIR)/$(kmoddir)
+	$(Q)cp -a $(RTE_SDK)/$(RTE_TARGET)/kmod/igb_uio.ko $(DESTDIR)/$(kmoddir)/igb_uio.ko
+	$(Q)cp -a $(RTE_SDK)/$(RTE_TARGET)/kmod/rte_kni.ko $(DESTDIR)/$(kmoddir)/rte_kni.ko
+
+	$(Q)test -d $(DESTDIR)/$(confdir) || mkdir -p $(DESTDIR)/$(confdir)
+	$(Q)cp -a jupiter.cfg $(DESTDIR)/$(confdir)
+	@echo ================== Installation in $(DESTDIR)/ complete
+
+.PHONY: uninstall
+uninstall:
+	@echo ================== Uninstalling $(DESTDIR)/
+	$(Q)rm -f $(DESTDIR)/$(bindir)/jupiter-ctl
+	$(Q)rm -f $(DESTDIR)/$(bindir)/jupiter-service
+	$(Q)rm -f $(DESTDIR)/$(bindir)/jupiter-pdump
+	$(Q)rm -f $(DESTDIR)/$(tooldir)/cpu_layout.py
+	$(Q)rm -f $(DESTDIR)/$(tooldir)/dpdk-devbind.py
+	$(Q)rm -f $(DESTDIR)/$(kmoddir)/igb_uio.ko
+	$(Q)rm -f $(DESTDIR)/$(kmoddir)/rte_kni.ko
+	$(Q)rm -f $(DESTDIR)/$(confdir)/jupiter.cfg
+	@echo ================== Uninstallation in $(DESTDIR)/ complete
+
+.PHONY: rpm-pkg
+rpm-pkg:
+	$(Q)rm -rf rpmbuild
+	$(Q)rm -f jupiter-$(VERSION).tar.xz
+	$(Q)tar -cJf jupiter-$(VERSION).tar.xz --xform 's#^#jupiter-$(VERSION)/#' *
+	$(Q)mkdir -p rpmbuild/{RPMS,SRPMS,BUILD,SOURCES,SPECS}
+	$(Q)mv jupiter-$(VERSION).tar.xz rpmbuild/SOURCES/jupiter-$(VERSION).tar.xz
+	$(Q)cp rpm.spec rpmbuild/SPECS
+	$(Q)rpmbuild -bb \
+		--define "_topdir $(PWD)/rpmbuild" \
+		--define "_version $(VERSION)" \
+		--define "_machine $(machine)" \
+		rpmbuild/SPECS/rpm.spec
+
+.PHONY: clean
+clean:
+	$(Q)cd $(RTE_SDK) && $(MAKE) O=$(RTE_TARGET) clean
+	$(Q)cd lib && $(MAKE) O=$(RTE_TARGET) clean
+	$(Q)cd cmd && $(MAKE) O=$(RTE_TARGET) clean
+	$(Q)cd core && $(MAKE) O=$(RTE_TARGET) clean
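Every path and toggle in the top-level Makefile is a plain Make variable, so builds and installs can be customized entirely from the command line without editing the file; for illustration:

    make machine=native V=1
    sudo make install DESTDIR=/tmp/stage bindir=/opt/jupiter/bin
    make rpm-pkg VERSION=0.2
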
diff --git a/cmd/Makefile b/cmd/Makefile
new file mode 100644
index 0000000..fd68d1e
--- /dev/null
+++ b/cmd/Makefile
@@ -0,0 +1,21 @@
+# Copyright (c) 2018. TIG developer.
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+# Default target, can be overridden by command line or environment
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# binary name
+APP = jupiter-ctl
+
+# all sources are stored in SRCS-y
+SRCS-y := main.c
+
+CFLAGS += $(WERROR_FLAGS) -g -O3
+
+CFLAGS += -I$(SRCDIR)/../lib/libcmd/$(RTE_TARGET)/include
+LDLIBS += -L$(SRCDIR)/../lib/libcmd/$(RTE_TARGET)/lib -lcmd
+
+include $(RTE_SDK)/mk/rte.extapp.mk
diff --git a/cmd/main.c b/cmd/main.c
new file mode 100644
index 0000000..d0d46bb
--- /dev/null
+++ b/cmd/main.c
@@ -0,0 +1,61 @@
+/* Copyright (c) 2018. TIG developer. */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "unixctl_command.h"
+
+static const char *default_unix_sock_path = "/var/run/jupiter.sock";
+
+static void
+usage(const char *progname) {
+    printf("Usage: %s COMMAND [ARG...] [--unixsock=%s]\n", progname,
+           default_unix_sock_path);
+}
+
+static const char *
+parse_progname(const char *arg) {
+    char *p;
+    if ((p = strrchr(arg, '/')) != NULL)
+        return strdup(p + 1);
+    return strdup(arg);
+}
+
+int
+main(int argc, char **argv) {
+    const char *unix_sock_path = NULL;
+    char cmdline[1024] = {0};
+    int client, ret;
+    int i;
+
+    for (i = 1; i < argc; i++) {
+        if (strncmp("--unixsock=", argv[i], strlen("--unixsock=")) == 0) {
+            unix_sock_path = strdup(argv[i] + strlen("--unixsock="));
+        } else {
+            if (strlen(cmdline) + strlen(argv[i]) + 2 > sizeof(cmdline))
+                break;
+            strcat(cmdline, argv[i]);
+            strcat(cmdline, " ");
+        }
+    }
+    if (!unix_sock_path)
+        unix_sock_path = default_unix_sock_path;
+    if (strlen(cmdline) == 0) {
+        const char *progname;
+        progname = parse_progname(argv[0]);
+        usage(progname);
+        strcat(cmdline, " ");
+    }
+    client = unixctl_client_create(unix_sock_path);
+    if (client < 0) {
+        fprintf(stderr, "Create unix socket client failed.\n");
+        return -1;
+    }
+    ret = unixctl_client_request(client, cmdline);
+    if (ret < 0) {
+        fprintf(stderr, "Unable to request unix socket server.\n");
+        return -1;
+    }
+    return ret != 0 ? -1 : 0;
+}
+
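jupiter-ctl is only a thin wrapper over the two libcmd client calls used above; a stripped-down caller might look like this (signatures as in main.c; the server's reply is assumed to be printed by the library, and command lines carry a trailing space exactly as main.c builds them):

    #include "unixctl_command.h"

    int
    main(void) {
        /* Connect to the control socket of a running jupiter-service. */
        int client = unixctl_client_create("/var/run/jupiter.sock");
        if (client < 0)
            return -1;
        /* Send one command line, e.g. the ARP dump registered in lb_arp.c. */
        return unixctl_client_request(client, "arp/list ") != 0 ? -1 : 0;
    }
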
diff --git a/core/Makefile b/core/Makefile
new file mode 100644
index 0000000..0f0bf20
--- /dev/null
+++ b/core/Makefile
@@ -0,0 +1,30 @@
+# Copyright (c) 2018. TIG developer.
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+ifeq ($(RTE_TARGET),)
+$(error "Please define RTE_TARGET environment variable")
+endif
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# binary name
+APP = jupiter-service
+
+# all sources are stored in SRCS-y
+SRCS-y := main.c lb_device.c lb_arp.c lb_parser.c lb_service.c lb_scheduler.c \
+          lb_conn.c lb_proto.c lb_proto_tcp.c lb_toa.c lb_synproxy.c \
+          lb_proto_udp.c lb_proto_icmp.c lb_tcp_secret_seq.c \
+          lb_config.c
+
+CFLAGS += $(WERROR_FLAGS) -g -O3
+
+CFLAGS += -I$(SRCDIR)/../lib/libconhash/$(RTE_TARGET)/include
+LDLIBS += -L$(SRCDIR)/../lib/libconhash/$(RTE_TARGET)/lib -lconhash
+
+CFLAGS += -I$(SRCDIR)/../lib/libcmd/$(RTE_TARGET)/include
+LDLIBS += -L$(SRCDIR)/../lib/libcmd/$(RTE_TARGET)/lib -lcmd
+
+include $(RTE_SDK)/mk/rte.extapp.mk
diff --git a/core/lb_arp.c b/core/lb_arp.c
new file mode 100644
index 0000000..baf0aa7
--- /dev/null
+++ b/core/lb_arp.c
@@ -0,0 +1,340 @@
+/* Copyright (c) 2018. TIG developer. */
+
+#include <string.h>
+
+#include <rte_arp.h>
+#include <rte_atomic.h>
+#include <rte_byteorder.h>
+#include <rte_cycles.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_hash.h>
+#include <rte_hash_crc.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_rwlock.h>
+#include <rte_timer.h>
+
+#include <unixctl_command.h>
+
+#include "lb_arp.h"
+#include "lb_clock.h"
+#include "lb_device.h"
+#include "lb_parser.h"
+
+#define LB_MAX_ARP 4096
+
+struct arp_entry {
+    struct arp_table *tbl;
+    struct ether_addr ha;
+    uint32_t ip;
+    uint32_t timeout;
+    uint32_t create_time;
+    rte_atomic32_t use_time;
+    struct rte_timer timer;
+};
+
+struct arp_table {
+    struct rte_hash *hash;
+    struct arp_entry *entries;
+    uint32_t timeout;
+    rte_rwlock_t rwlock;
+};
+
+static struct arp_table arp_tbl[RTE_MAX_ETHPORTS];
+static uint32_t arp_timeout = 1800 * LB_CLOCK_HZ;
+
+#define ARP_TABLE_RWLOCK_RLOCK(t) rte_rwlock_read_lock(&(t)->rwlock)
+#define ARP_TABLE_RWLOCK_RUNLOCK(t) rte_rwlock_read_unlock(&(t)->rwlock)
+#define ARP_TABLE_RWLOCK_WLOCK(t) rte_rwlock_write_lock(&(t)->rwlock)
+#define ARP_TABLE_RWLOCK_WUNLOCK(t) rte_rwlock_write_unlock(&(t)->rwlock)
+
+#define MAC_ADDR_CMP 0xFFFFFFFFFFFFULL
+
+static inline int __attribute__((always_inline))
+ether_addr_cmp(struct ether_addr *ea, struct ether_addr *eb) {
+    return ((*(uint64_t *)ea ^ *(uint64_t *)eb) & MAC_ADDR_CMP) == 0;
+}
+
+static void
+arp_expire(struct rte_timer *t, void *arg) {
+    struct arp_entry *entry = arg;
+    struct arp_table *tbl = entry->tbl;
+    uint32_t curr_time, use_time;
+    int rc;
+
+    curr_time = LB_CLOCK();
+    use_time = rte_atomic32_read(&entry->use_time);
+    if (curr_time - use_time >= entry->timeout) {
+        ARP_TABLE_RWLOCK_WLOCK(tbl);
+        rte_hash_del_key(tbl->hash, &entry->ip);
+        ARP_TABLE_RWLOCK_WUNLOCK(tbl);
+        rc = rte_timer_stop(t);
+        if (rc < 0) {
+            RTE_LOG(WARNING, USER1,
+                    "%s(): Stop arp timer failed, ip(0x%08x).\n", __func__,
+                    rte_be_to_cpu_32(entry->ip));
+        }
+    }
+}
+
+void
+lb_arp_input(struct rte_mbuf *pkt, uint16_t port_id) {
+    struct arp_table *tbl;
+    struct arp_entry *entry;
+    struct arp_hdr *arph;
+    uint32_t sip;
+    struct ether_addr *sha;
+    int i;
+
+    tbl = &arp_tbl[port_id];
+    arph = rte_pktmbuf_mtod_offset(pkt, struct arp_hdr *, ETHER_HDR_LEN);
+    sip = arph->arp_data.arp_sip;
+    sha = &arph->arp_data.arp_sha;
+    i = rte_hash_lookup(tbl->hash, &sip);
+    if (i < 0) {
+        /* add */
+        ARP_TABLE_RWLOCK_WLOCK(tbl);
+        i = rte_hash_add_key(tbl->hash, &sip);
+        if (i < 0) {
+            ARP_TABLE_RWLOCK_WUNLOCK(tbl);
+            RTE_LOG(WARNING, USER1,
+                    "%s(): Add key(0x%08X) to arp table failed, %s.\n",
+                    __func__, rte_be_to_cpu_32(sip), strerror(-i));
+            return;
+        }
+
+        entry = &tbl->entries[i];
+        entry->tbl = tbl;
+        ether_addr_copy(sha,
&entry->ha); + entry->ip = sip; + entry->timeout = tbl->timeout; + entry->create_time = LB_CLOCK(); + rte_atomic32_set(&entry->use_time, entry->create_time); + rte_timer_init(&entry->timer); + rte_timer_reset(&entry->timer, SEC_TO_CYCLES(5), PERIODICAL, + rte_get_master_lcore(), arp_expire, entry); + ARP_TABLE_RWLOCK_WUNLOCK(tbl); + } else { + /* update */ + entry = &tbl->entries[i]; + if (!ether_addr_cmp(sha, &entry->ha)) { + ARP_TABLE_RWLOCK_WLOCK(tbl); + ether_addr_copy(sha, &entry->ha); + rte_atomic32_set(&entry->use_time, LB_CLOCK()); + ARP_TABLE_RWLOCK_WUNLOCK(tbl); + } + } +} + +static int +arp_send(uint16_t type, uint32_t dst_ip, uint32_t src_ip, + struct ether_addr *dst_ha, struct ether_addr *src_ha, + uint16_t port_id) { + static const struct ether_addr bc_ha = { + {0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}; + struct lb_device *dev = &lb_devices[port_id]; + struct rte_mbuf *m; + struct ether_hdr *eth; + struct arp_hdr *ah; + uint32_t lcore_id; + uint16_t txq_id; + struct rte_eth_dev_tx_buffer *tx_buffer; + + m = lb_device_pktmbuf_alloc(port_id); + if (m == NULL) { + RTE_LOG(WARNING, USER1, "%s(): Alloc packet mbuf failed.\n", __func__); + return -1; + } + + eth = rte_pktmbuf_mtod(m, struct ether_hdr *); + if (dst_ha != NULL) { + ether_addr_copy(src_ha, ð->s_addr); + ether_addr_copy(dst_ha, ð->d_addr); + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_ARP); + } else { + ether_addr_copy(src_ha, ð->s_addr); + ether_addr_copy(&bc_ha, ð->d_addr); + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_ARP); + } + + ah = rte_pktmbuf_mtod_offset(m, struct arp_hdr *, ETHER_HDR_LEN); + ah->arp_hrd = rte_cpu_to_be_16(ARP_HRD_ETHER); + ah->arp_pro = rte_cpu_to_be_16(ETHER_TYPE_IPv4); + ah->arp_hln = 0x6; + ah->arp_pln = 0x4; + ah->arp_op = rte_cpu_to_be_16(type); + + ether_addr_copy(src_ha, &ah->arp_data.arp_sha); + ah->arp_data.arp_sip = src_ip; + if (dst_ha != NULL) + ether_addr_copy(dst_ha, &ah->arp_data.arp_tha); + else + memset(&ah->arp_data.arp_tha, 0, sizeof(struct ether_addr)); + ah->arp_data.arp_tip = dst_ip; + + m->data_len = ETHER_HDR_LEN + sizeof(*ah); + m->pkt_len = ETHER_HDR_LEN + sizeof(*ah); + + lcore_id = rte_lcore_id(); + txq_id = dev->lcore_conf[lcore_id].txq_id; + tx_buffer = dev->tx_buffer[lcore_id]; + rte_eth_tx_buffer(port_id, txq_id, tx_buffer, m); + + return 0; +} + +int +lb_arp_request(uint32_t dip, uint16_t port_id) { + struct lb_device *dev = &lb_devices[port_id]; + + return arp_send(ARP_OP_REQUEST, dip, dev->ipv4, NULL, &dev->ha, port_id); +} + +int +lb_arp_find(uint32_t ip, struct ether_addr *mac, uint16_t port_id) { + struct arp_table *tbl; + int i; + + tbl = &arp_tbl[port_id]; + ARP_TABLE_RWLOCK_RLOCK(tbl); + i = rte_hash_lookup(tbl->hash, &ip); + if (i >= 0) { + ether_addr_copy(&tbl->entries[i].ha, mac); + rte_atomic32_set(&tbl->entries[i].use_time, LB_CLOCK()); + } + ARP_TABLE_RWLOCK_RUNLOCK(tbl); + + return i; +} + +int +lb_arp_init(void) { + uint16_t nb_ports, i; + struct rte_hash_parameters params; + char name[RTE_HASH_NAMESIZE]; + int socket_id; + + nb_ports = rte_eth_dev_count(); + for (i = 0; i < nb_ports; i++) { + if (lb_devices[i].type != LB_DEV_T_NORM && + lb_devices[i].type != LB_DEV_T_MASTER) { + continue; + } + + socket_id = rte_eth_dev_socket_id(i); + + memset(¶ms, 0, sizeof(params)); + snprintf(name, sizeof(name), "arphash%u", i); + params.name = name; + params.entries = LB_MAX_ARP; + params.key_len = sizeof(uint32_t); + params.hash_func = rte_hash_crc; + params.socket_id = socket_id; + + arp_tbl[i].hash = rte_hash_create(¶ms); + if (arp_tbl[i].hash == NULL) 
{
+            RTE_LOG(ERR, USER1, "%s(): Create arp hash (%s) failed, %s.\n",
+                    __func__, name, rte_strerror(rte_errno));
+            return -1;
+        }
+
+        arp_tbl[i].entries =
+            rte_zmalloc_socket(NULL, LB_MAX_ARP * sizeof(struct arp_entry),
+                               RTE_CACHE_LINE_SIZE, socket_id);
+        if (arp_tbl[i].entries == NULL) {
+            RTE_LOG(ERR, USER1, "%s(): Alloc memory for arp table failed.\n",
+                    __func__);
+            return -1;
+        }
+        rte_rwlock_init(&arp_tbl[i].rwlock);
+
+        arp_tbl[i].timeout = arp_timeout;
+
+        RTE_LOG(INFO, USER1,
+                "%s(): Create arp table for port(%s) on socket%d.\n", __func__,
+                lb_devices[i].name, socket_id);
+    }
+
+    return 0;
+}
+
+static void
+arp_list_cb(int fd, __attribute__((unused)) char *argv[],
+            __attribute__((unused)) int argc) {
+    uint16_t nb_ports, i;
+    const void *key;
+    void *data;
+    uint32_t next;
+    struct arp_entry *entry;
+    char ip[32], mac[32];
+    uint32_t ctime, sec;
+    int rc;
+
+    unixctl_command_reply(
+        fd, "IPaddress       HWaddress         Iface      AliveTime\n");
+
+    nb_ports = rte_eth_dev_count();
+    ctime = LB_CLOCK();
+
+    for (i = 0; i < nb_ports; i++) {
+        if (lb_devices[i].type != LB_DEV_T_NORM &&
+            lb_devices[i].type != LB_DEV_T_MASTER) {
+            continue;
+        }
+
+        next = 0;
+        while ((rc = rte_hash_iterate(arp_tbl[i].hash, &key, &data, &next)) >=
+               0) {
+            entry = &arp_tbl[i].entries[rc];
+            ipv4_addr_tostring(entry->ip, ip, sizeof(ip));
+            mac_addr_tostring(&entry->ha, mac, sizeof(mac));
+            sec = LB_CLOCK_TO_SEC(ctime - entry->create_time);
+            unixctl_command_reply(fd, "%-15s %-17s %-10s %u\n", ip, mac,
+                                  lb_devices[i].name, sec);
+        }
+    }
+}
+
+UNIXCTL_CMD_REGISTER("arp/list", "", "", 0, 0, arp_list_cb);
+
+static void
+arp_timeout_cb(int fd, char *argv[], int argc) {
+    uint32_t timeout, echo = 0;
+    int rc;
+    uint16_t nb_ports, i;
+
+    if (argc == 0) {
+        echo = 1;
+    } else {
+        rc = parser_read_uint32(&timeout, argv[0]);
+        if (rc < 0) {
+            unixctl_command_reply_error(fd, "Invalid parameter: %s.\n",
+                                        argv[0]);
+            return;
+        }
+        timeout = SEC_TO_LB_CLOCK(timeout);
+    }
+
+    nb_ports = rte_eth_dev_count();
+    for (i = 0; i < nb_ports; i++) {
+        if (lb_devices[i].type != LB_DEV_T_NORM &&
+            lb_devices[i].type != LB_DEV_T_MASTER) {
+            continue;
+        }
+
+        if (echo) {
+            unixctl_command_reply(fd, "%u\n",
+                                  LB_CLOCK_TO_SEC(arp_tbl[i].timeout));
+            return;
+        } else {
+            arp_tbl[i].timeout = timeout;
+        }
+    }
+}
+
+UNIXCTL_CMD_REGISTER("arp/timeout", "[SEC]", "", 0, 1, arp_timeout_cb);
+
diff --git a/core/lb_arp.h b/core/lb_arp.h
new file mode 100644
index 0000000..c8c5a0d
--- /dev/null
+++ b/core/lb_arp.h
@@ -0,0 +1,17 @@
+/* Copyright (c) 2018. TIG developer. */
+
+#ifndef __LB_ARP_H__
+#define __LB_ARP_H__
+
+#include <stdint.h>
+
+struct ether_addr;
+struct rte_mbuf;
+
+int lb_arp_init(void);
+int lb_arp_request(uint32_t dip, uint16_t port_id);
+void lb_arp_input(struct rte_mbuf *pkt, uint16_t port_id);
+int lb_arp_find(uint32_t ip, struct ether_addr *mac, uint16_t port_id);
+
+#endif
+
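A datapath consumer resolves the next hop's MAC before rewriting the Ethernet header and falls back to an ARP request on a miss; a minimal sketch against the API above (the function name resolve_next_hop is hypothetical):

    /* Returns 0 with *dmac filled on a hit; on a miss, kicks off an ARP
     * request (broadcast who-has) so a later retry can succeed. */
    static int
    resolve_next_hop(uint32_t dip, struct ether_addr *dmac, uint16_t port_id) {
        if (lb_arp_find(dip, dmac, port_id) >= 0)
            return 0; /* hit also refreshes the entry's use_time */
        lb_arp_request(dip, port_id);
        return -1; /* caller drops or queues the packet */
    }
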
diff --git a/core/lb_clock.h b/core/lb_clock.h
new file mode 100644
index 0000000..58ed689
--- /dev/null
+++ b/core/lb_clock.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2018. TIG developer. */
+
+#ifndef __LB_CLOCK_H__
+#define __LB_CLOCK_H__
+
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+
+extern rte_atomic32_t lb_clock;
+
+#define LB_CLOCK_HZ (100)
+
+#define LB_CLOCK_PER_S LB_CLOCK_HZ
+
+#define LB_CLOCK() ((uint32_t)rte_atomic32_read(&lb_clock))
+
+/* MS_PER_S defined in rte_cycles.h */
+#define MS_TO_CYCLES(a) ((rte_get_timer_hz() + MS_PER_S - 1) / MS_PER_S * (a))
+
+#define SEC_TO_CYCLES(a) (rte_get_timer_hz() * (a))
+
+#define SEC_TO_LB_CLOCK(a) (LB_CLOCK_PER_S * (a))
+
+#define LB_CLOCK_TO_SEC(a) (((a) + LB_CLOCK_PER_S - 1) / LB_CLOCK_PER_S)
+
+#endif
+
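Since LB_CLOCK_HZ is 100, one soft-clock tick is 10 ms and all conversions are pure scaling; a worked check of the macros (cycle counts assume a hypothetical 2.0 GHz timer):

    uint32_t ticks = SEC_TO_LB_CLOCK(1800);  /* 1800 s -> 180000 ticks   */
    uint32_t secs = LB_CLOCK_TO_SEC(180000); /* 180000 ticks -> 1800 s   */
    uint64_t cycles = MS_TO_CYCLES(10);      /* 10 ms -> 20000000 cycles */
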
diff --git a/core/lb_config.c b/core/lb_config.c
new file mode 100644
index 0000000..66ec684
--- /dev/null
+++ b/core/lb_config.c
@@ -0,0 +1,423 @@
+/* Copyright (c) 2018. TIG developer. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <rte_byteorder.h>
+#include <rte_cfgfile.h>
+#include <rte_eth_bond.h>
+#include <rte_ip.h>
+
+#include "lb_config.h"
+#include "lb_parser.h"
+
+struct lb_conf *lb_cfg;
+
+struct conf_entry {
+    const char *name;
+    int required;
+    int (*parse)(const char *, void *);
+};
+
+static int
+dpdk_entry_parse_argv(const char *token, void *_conf) {
+    struct lb_dpdk_conf *conf = _conf;
+    char *argv_str, *p;
+    char **argv;
+    int argc = 0;
+
+    argv = conf->argv;
+    argv[argc] = strdup("jupiter-service");
+    if (argv[argc] == NULL)
+        return -1;
+    argc++;
+
+    argv_str = strdup(token);
+    if (argv_str == NULL)
+        return -1;
+    p = strtok(argv_str, " ");
+    while (p != NULL) {
+        if (argc == LB_MAX_DPDK_ARGS)
+            break;
+        argv[argc] = strdup(p);
+        if (argv[argc] == NULL)
+            return -1;
+        argc++;
+        p = strtok(NULL, " ");
+    }
+    conf->argc = argc;
+    free(argv_str);
+    return 0;
+}
+
+static const struct conf_entry dpdk_entries[] = {
+    {
+        .name = "argv",
+        .required = 1,
+        .parse = dpdk_entry_parse_argv,
+    },
+};
+
+static int
+device_entry_parse_name(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    char *name;
+
+    name = conf->name;
+    snprintf(name, RTE_KNI_NAMESIZE, "%s", token);
+    return 0;
+}
+
+static int
+device_entry_parse_mode(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    uint32_t mode;
+
+    if (strcmp(token, "rr") == 0)
+        mode = BONDING_MODE_ROUND_ROBIN;
+    else if (strcmp(token, "active-backup") == 0)
+        mode = BONDING_MODE_ACTIVE_BACKUP;
+    else
+        return -1;
+
+    conf->mode = mode;
+    return 0;
+}
+
+static int
+device_entry_parse_ipv4(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    uint32_t addr;
+
+    if (parse_ipv4_addr(token, (struct in_addr *)&addr) < 0)
+        return -1;
+
+    conf->ipv4 = addr;
+    return 0;
+}
+
+static int
+device_entry_parse_netmask(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    uint32_t addr;
+
+    if (parse_ipv4_addr(token, (struct in_addr *)&addr) < 0)
+        return -1;
+
+    conf->netmask = addr;
+    return 0;
+}
+
+static int
+device_entry_parse_gw(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    uint32_t addr;
+
+    if (parse_ipv4_addr(token, (struct in_addr *)&addr) < 0)
+        return -1;
+
+    conf->gw = addr;
+    return 0;
+}
+
+static int
+device_entry_parse_rxqsize(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    uint16_t size;
+
+    if (parser_read_uint16(&size, token) < 0)
+        return -1;
+
+    conf->rxqsize = size;
+    return 0;
+}
+
+static int
+device_entry_parse_txqsize(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    uint16_t size;
+
+    if (parser_read_uint16(&size, token) < 0)
+        return -1;
+
+    conf->txqsize = size;
+    return 0;
+}
+
+static int
+device_entry_parse_mtu(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    uint16_t size;
+
+    if (parser_read_uint16(&size, token) < 0)
+        return -1;
+
+    conf->mtu = size;
+    return 0;
+}
+
+static int
+device_entry_parse_rxoffload(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    uint32_t size;
+
+    if (parser_read_uint32_hex(&size, token) < 0)
+        return -1;
+
+    conf->rxoffload = size;
+    return 0;
+}
+
+static int
+device_entry_parse_txoffload(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    uint32_t size;
+
+    if (parser_read_uint32_hex(&size, token) < 0)
+        return -1;
+
+    conf->txoffload = size;
+    return 0;
+}
+
+static int
+device_entry_parse_local_ipv4(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    char *addr_str, *p;
+    uint32_t *lips;
+    int j = 0;
+    uint32_t a, b, c, d, e;
+    uint32_t i;
+    uint32_t addr, netmask, num;
+
+    lips = conf->lips;
+    addr_str = strdup(token);
+    if (addr_str == NULL)
+        return -1;
+    p = strtok(addr_str, " ,");
+    while (p != NULL) {
+        if (sscanf(p, "%u.%u.%u.%u/%u", &a, &b, &c, &d, &e) != 5)
+            return -1;
+        if (a > 255 || b > 255 || c > 255 || d > 255 || e > 32 || e == 0)
+            return -1;
+        /* e = 30:
+         * ~0u << (32 - 30):
+         * 1111 1111 1111 1111 1111 1111 1111 1100
+         * (an unsigned shift; the previous (1 << 31) >> (e - 1) relied on
+         * implementation-defined signed right-shift behavior)
+         */
+        netmask = ~0u << (32 - e);
+        addr = IPv4(a, b, c, d) & netmask;
+        num = 1 << (32 - e);
+        for (i = 0; i < num; i++) {
+            if (j == LB_MAX_LADDR)
+                goto end;
+            lips[j++] = rte_cpu_to_be_32(addr + i);
+        }
+        p = strtok(NULL, " ,");
+    }
+
+end:
+    conf->nb_lips = j;
+    free(addr_str);
+    return 0;
+}
+
+static int
+device_entry_parse_pci(const char *token, void *_conf) {
+    struct lb_device_conf *conf = _conf;
+    char *pci_str, *p;
+    struct rte_pci_addr *pci_addrs;
+    int i = 0;
+
+    pci_addrs = conf->pcis;
+    pci_str = strdup(token);
+    if (pci_str == NULL)
+        return -1;
+    p = strtok(pci_str, " ,");
+    while (p != NULL) {
+        if (i == RTE_MAX_ETHPORTS)
+            break;
+        if (rte_pci_addr_parse(p, &pci_addrs[i++]) < 0)
+            return -1;
+        p = strtok(NULL, " ,");
+    }
+    conf->nb_pcis = i;
+    free(pci_str);
+    return 0;
+}
+
+static const struct conf_entry device_entries[] = {
+    {
+        .name = "name",
+        .required = 1,
+        .parse = device_entry_parse_name,
+    },
+    {
+        .name = "mode",
+        .required = 0,
+        .parse = device_entry_parse_mode,
+    },
+    {
+        .name = "ipv4",
+        .required = 1,
+        .parse = device_entry_parse_ipv4,
+    },
+    {
+        .name = "netmask",
+        .required = 1,
+        .parse = device_entry_parse_netmask,
+    },
+    {
+        .name = "gw",
+        .required = 1,
+        .parse = device_entry_parse_gw,
+    },
+    {
+        .name = "rxqsize",
+        .required = 1,
+        .parse = device_entry_parse_rxqsize,
+    },
+    {
+        .name = "txqsize",
+        .required = 1,
+        .parse = device_entry_parse_txqsize,
+    },
+    {
+        .name = "mtu",
+        .required = 1,
+        .parse = device_entry_parse_mtu,
+    },
+    {
+        .name = "rxoffload",
+        .required = 0,
+        .parse = device_entry_parse_rxoffload,
+    },
+    {
+        .name = "txoffload",
+        .required = 0,
+        .parse = device_entry_parse_txoffload,
+    },
+    {
+        .name = "local-ipv4",
+        .required = 1,
+        .parse = device_entry_parse_local_ipv4,
+    },
+    {
+        .name = "pci",
+        .required = 1,
+        .parse = device_entry_parse_pci,
+    },
+};
+
+static int
+dpdk_section_parse(struct rte_cfgfile *cfgfile, const char *section,
+                   struct lb_dpdk_conf *conf) {
+    const char *val;
+    uint32_t j;
+
+    for (j = 0; j < RTE_DIM(dpdk_entries); j++) {
+        val = rte_cfgfile_get_entry(cfgfile, section, dpdk_entries[j].name);
+        if (val == NULL) {
+            if (dpdk_entries[j].required) {
+                printf("%s(): %s is required in section %s.\n", __func__,
+                       dpdk_entries[j].name, section);
+                return -1;
+            }
+        } else {
+            if (dpdk_entries[j].parse(val, conf) < 0) {
+                printf("%s(): Cannot parse %s in section %s.\n", __func__,
+                       dpdk_entries[j].name, section);
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
+static int
+device_section_parse(struct rte_cfgfile *cfgfile, const char *section,
+                     struct lb_device_conf *conf) {
+    const char *val;
+    uint32_t j;
+
+    for (j = 0; j < RTE_DIM(device_entries); j++) {
+        val = rte_cfgfile_get_entry(cfgfile, section, device_entries[j].name);
+        if (val == NULL) {
+            if (device_entries[j].required) {
+                printf("%s(): %s is required in section %s.\n", __func__,
+                       device_entries[j].name, section);
+                return -1;
+            }
+        } else {
+            if (device_entries[j].parse(val, conf) < 0) {
+                printf("%s(): Cannot parse %s in section %s.\n", __func__,
+                       device_entries[j].name, section);
+                return -1;
+            }
+        }
+    }
+    return 0;
+}
+
+int
+lb_config_file_load(const char *cfgfile_path) {
+    struct rte_cfgfile *cfgfile;
+    int i;
+    char **sections;
+    int num_sections;
+
+    cfgfile = rte_cfgfile_load(cfgfile_path, 0);
+    if (cfgfile == NULL) {
+        printf("%s(): Load config file %s failed.\n", __func__, cfgfile_path);
+        return -1;
+    }
+
+    num_sections = rte_cfgfile_num_sections(cfgfile, "", 0);
+    if (num_sections == 0) {
+        printf("%s(): There are no sections in config file.\n", __func__);
+        return -1;
+    }
+
+    sections = malloc(num_sections * sizeof(char *));
+    if (sections == NULL) {
+        printf("%s(): Alloc memory failed.\n", __func__);
+        return -1;
+    }
+    for (i = 0; i < num_sections; i++) {
+        sections[i] = malloc(CFG_NAME_LEN);
+        if (sections[i] == NULL) {
+            printf("%s(): Alloc memory failed.\n", __func__);
+            return -1;
+        }
+    }
+
+    lb_cfg = malloc(sizeof(struct lb_conf));
+    if (lb_cfg == NULL) {
+        printf("%s(): Alloc memory for lb_conf failed.\n", __func__);
+        return -1;
+    }
+    memset(lb_cfg, 0, sizeof(*lb_cfg));
+
+    num_sections = rte_cfgfile_sections(cfgfile, sections, num_sections);
+    for (i = 0; i < num_sections; i++) {
+        int rc = 0;
+
+        if (strncmp(sections[i], "DEVICE", 6) == 0)
+            rc = device_section_parse(cfgfile, sections[i],
+                                      &lb_cfg->devices[lb_cfg->nb_devices++]);
+        else if (strcmp(sections[i], "DPDK") == 0)
+            rc = dpdk_section_parse(cfgfile, sections[i], &lb_cfg->dpdk);
+
+        if (rc < 0) {
+            printf("%s(): Cannot parse section %s.\n", __func__, sections[i]);
+            return -1;
+        }
+    }
+
+    rte_cfgfile_close(cfgfile);
+
+    return 0;
+}
+
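A quick worked example of the prefix-to-range expansion in device_entry_parse_local_ipv4 above (host byte order; the concrete addresses are illustrative):

    uint32_t e = 30;
    uint32_t netmask = ~0u << (32 - e);             /* 0xFFFFFFFC          */
    uint32_t addr = IPv4(192, 168, 1, 7) & netmask; /* base 192.168.1.4    */
    uint32_t num = 1 << (32 - e);                   /* 4 local IPs: .4-.7  */
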
diff --git a/core/lb_config.h b/core/lb_config.h
new file mode 100644
index 0000000..2c8868f
--- /dev/null
+++ b/core/lb_config.h
@@ -0,0 +1,45 @@
+/* Copyright (c) 2018. TIG developer. */
+
+#ifndef __LB_CONFIG_H__
+#define __LB_CONFIG_H__
+
+#include <rte_kni.h>
+#include <rte_pci.h>
+
+#define LB_MAX_LADDR 256
+
+struct lb_device_conf {
+    char name[RTE_KNI_NAMESIZE];
+    uint32_t mode;
+    uint32_t ipv4;
+    uint32_t netmask;
+    uint32_t gw;
+    uint16_t rxqsize, txqsize;
+    uint16_t mtu;
+    uint32_t rxoffload;
+    uint32_t txoffload;
+    uint32_t nb_lips;
+    uint32_t lips[LB_MAX_LADDR];
+    uint16_t nb_pcis;
+    struct rte_pci_addr pcis[RTE_MAX_ETHPORTS];
+};
+
+#define LB_MAX_DPDK_ARGS 128
+
+struct lb_dpdk_conf {
+    char *argv[LB_MAX_DPDK_ARGS];
+    int argc;
+};
+
+struct lb_conf {
+    struct lb_device_conf devices[RTE_MAX_ETHPORTS];
+    uint16_t nb_devices;
+    struct lb_dpdk_conf dpdk;
+};
+
+extern struct lb_conf *lb_cfg;
+
+int lb_config_file_load(const char *cfgfile_path);
+
+#endif
+
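Tying the section parsers together, a minimal jupiter.cfg this loader would accept might read as follows (section and entry names come from the tables above; every address and PCI ID is a placeholder):

    [DPDK]
    argv = -c 0xf -n 4

    [DEVICE0]
    name = jupiter0
    ipv4 = 10.0.0.10
    netmask = 255.255.255.0
    gw = 10.0.0.1
    rxqsize = 1024
    txqsize = 1024
    mtu = 1500
    local-ipv4 = 10.0.0.128/30
    pci = 0000:01:00.0

Any section whose name starts with DEVICE is parsed as a device; mode, rxoffload, and txoffload are the only optional device entries.
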
diff --git a/core/lb_conn.c b/core/lb_conn.c
new file mode 100644
index 0000000..46873ce
--- /dev/null
+++ b/core/lb_conn.c
@@ -0,0 +1,225 @@
+/* Copyright (c) 2018. TIG developer. */
+
+#include <stdint.h>
+
+#include <sys/queue.h>
+
+#include <rte_atomic.h>
+#include <rte_cycles.h>
+#include <rte_errno.h>
+#include <rte_hash.h>
+#include <rte_hash_crc.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_mbuf.h>
+#include <rte_mempool.h>
+#include <rte_spinlock.h>
+#include <rte_timer.h>
+
+#include "lb_clock.h"
+#include "lb_conn.h"
+#include "lb_proto.h"
+#include "lb_service.h"
+
+#define CONN_TIMER_CYCLE MS_TO_CYCLES(10)
+
+struct lb_conn *
+lb_conn_new(struct lb_conn_table *ct, uint32_t cip, uint32_t cport,
+            struct lb_real_service *rs, uint8_t is_synproxy, uint16_t port_id) {
+    struct lb_conn *conn;
+    struct ipv4_4tuple tuple;
+    int rc;
+
+    rc = rte_mempool_get(ct->mp, (void **)&conn);
+    if (rc < 0) {
+        return NULL;
+    }
+
+    rc = lb_laddr_get(port_id, ct->type, &conn->laddr, &conn->lport);
+    if (rc < 0) {
+        rte_mempool_put(ct->mp, conn);
+        return NULL;
+    }
+
+    conn->ct = ct;
+    conn->lip = conn->laddr->ipv4;
+    conn->cip = cip;
+    conn->cport = cport;
+    conn->vip = rs->virt_service->vip;
+    conn->vport = rs->virt_service->vport;
+    conn->rip = rs->rip;
+    conn->rport = rs->rport;
+
+    conn->use_time = LB_CLOCK();
+    conn->timeout = ct->timeout;
+
+    conn->real_service = rs;
+    conn->flags = 0;
+    if (rs->virt_service->flags & LB_VS_F_TOA)
+        conn->flags |= LB_CONN_F_TOA;
+
+    if (is_synproxy) {
+        conn->flags |= LB_CONN_F_SYNPROXY;
+        conn->proxy.syn_mbuf = NULL;
+        conn->proxy.ack_mbuf = NULL;
+        conn->proxy.isn = 0;
+        conn->proxy.oft = 0;
+        conn->proxy.syn_retry = 5;
+    }
+
+    conn->tseq.isn = 0;
+    conn->tseq.oft = 0;
+
+    IPv4_4TUPLE(&tuple, conn->cip, conn->cport, conn->vip, conn->vport);
+    rc = rte_hash_add_key_data(ct->hash, (const void *)&tuple, conn);
+    if (rc < 0) {
+        lb_laddr_put(conn->laddr, conn->lport, ct->type);
+        rte_mempool_put(ct->mp, conn);
+        return NULL;
+    }
+
+    IPv4_4TUPLE(&tuple, conn->rip, conn->rport, conn->lip, conn->lport);
+    rc = rte_hash_add_key_data(ct->hash, (const void *)&tuple, conn);
+    if (rc < 0) {
+        IPv4_4TUPLE(&tuple, conn->cip, conn->cport, conn->vip, conn->vport);
+        rte_hash_del_key(ct->hash, (const void *)&tuple);
+        lb_laddr_put(conn->laddr, conn->lport, ct->type);
+        rte_mempool_put(ct->mp, conn);
+        return NULL;
+    }
+
+    rte_spinlock_lock(&ct->spinlock);
+    TAILQ_INSERT_TAIL(&ct->timeout_list, conn, next);
+    rte_spinlock_unlock(&ct->spinlock);
+
+    return conn;
+}
+
+struct lb_conn *
+lb_conn_find(struct lb_conn_table *ct, uint32_t sip, uint32_t dip,
+             uint16_t sport, uint16_t dport, uint8_t *dir) {
+    struct lb_conn *conn;
+    struct ipv4_4tuple tuple;
+    int rc;
+
+    IPv4_4TUPLE(&tuple, sip, sport, dip, dport);
+    rc = rte_hash_lookup_data(ct->hash, (const void *)&tuple, (void **)&conn);
+    if (rc < 0) {
+        *dir = LB_DIR_ORIGINAL;
+        return NULL;
+    }
+
+    conn->use_time = LB_CLOCK();
+
+    if (conn->cip == sip && conn->cport == sport)
+        *dir = LB_DIR_ORIGINAL;
+    else
+        *dir = LB_DIR_REPLY;
+
+    return conn;
+}
+
+static void
+__conn_expire(struct lb_conn_table *ct, struct lb_conn *conn) {
+    struct ipv4_4tuple tuple;
+
+    if (conn->flags & LB_CONN_F_SYNPROXY) {
+        rte_pktmbuf_free(conn->proxy.syn_mbuf);
+        rte_pktmbuf_free(conn->proxy.ack_mbuf);
+    }
+
+    if (conn->flags & LB_CONN_F_ACTIVE) {
+        rte_atomic32_add(&conn->real_service->active_conns, -1);
+        rte_atomic32_add(&conn->real_service->virt_service->active_conns, -1);
+    }
+
+    /* Delete the same two keys that lb_conn_new installed. */
+    IPv4_4TUPLE(&tuple, conn->cip, conn->cport, conn->vip, conn->vport);
+    rte_hash_del_key(ct->hash, (const void *)&tuple);
+
+    IPv4_4TUPLE(&tuple, conn->rip, conn->rport, conn->lip, conn->lport);
+    rte_hash_del_key(ct->hash, (const void *)&tuple);
+
+    lb_laddr_put(conn->laddr, conn->lport, ct->type);
+    lb_vs_put_rs(conn->real_service);
+    rte_mempool_put(ct->mp, conn);
+
+    TAILQ_REMOVE(&ct->timeout_list, conn, next);
+}
+
+void
+lb_conn_expire(struct lb_conn_table *ct, struct lb_conn *conn) { + rte_spinlock_lock(&ct->spinlock); + __conn_expire(ct, conn); + rte_spinlock_unlock(&ct->spinlock); +} + +static void +conn_table_expire_cb(__attribute((unused)) struct rte_timer *timer, void *arg) { + struct lb_conn_table *ct = arg; + struct lb_conn *conn; + void *tmp; + uint32_t curr_time; + + curr_time = LB_CLOCK(); + rte_spinlock_lock(&ct->spinlock); + for_each_conn_safe(conn, &ct->timeout_list, next, tmp) { + if (ct->timer_task_cb) + ct->timer_task_cb(conn); + if (ct->timer_expire_cb && + (ct->timer_expire_cb(conn, curr_time) == 0)) { + __conn_expire(ct, conn); + } + } + rte_spinlock_unlock(&ct->spinlock); +} + +int +lb_conn_table_init(struct lb_conn_table *ct, enum lb_proto_type type, + uint32_t lcore_id, uint32_t timeout, + void (*task_cb)(struct lb_conn *), + int (*expire_cb)(struct lb_conn *, uint32_t)) { + struct rte_hash_parameters param; + char name[RTE_HASH_NAMESIZE]; + uint32_t socket_id; + + socket_id = rte_lcore_to_socket_id(lcore_id); + + ct->type = type; + + memset(¶m, 0, sizeof(param)); + snprintf(name, sizeof(name), "ct_hash%p", ct); + param.name = name; + param.entries = LB_MAX_CONN * 2; + param.key_len = sizeof(struct ipv4_4tuple); + param.hash_func = rte_hash_crc; + param.socket_id = socket_id; + + ct->hash = rte_hash_create(¶m); + if (ct->hash == NULL) { + RTE_LOG(ERR, USER1, "%s(): Create hash table %s failed, %s.\n", + __func__, name, rte_strerror(rte_errno)); + return -1; + } + + snprintf(name, sizeof(name), "ct_mp%p", ct); + ct->mp = rte_mempool_create(name, LB_MAX_CONN, sizeof(struct lb_conn), 0, 0, + NULL, NULL, NULL, NULL, socket_id, + MEMPOOL_F_SP_PUT | MEMPOOL_F_SC_GET); + if (ct->mp == NULL) { + RTE_LOG(ERR, USER1, "%s(): Create mempool %s failed, %s\n", __func__, + name, rte_strerror(rte_errno)); + return -1; + } + + TAILQ_INIT(&ct->timeout_list); + ct->timeout = timeout; + ct->timer_task_cb = task_cb; + ct->timer_expire_cb = expire_cb; + rte_timer_init(&ct->timer); + rte_timer_reset(&ct->timer, CONN_TIMER_CYCLE, PERIODICAL, lcore_id, + conn_table_expire_cb, ct); + rte_spinlock_init(&ct->spinlock); + + return 0; +} + diff --git a/core/lb_conn.h b/core/lb_conn.h new file mode 100644 index 0000000..e4cc9da --- /dev/null +++ b/core/lb_conn.h @@ -0,0 +1,102 @@ +/* Copyright (c) 2018. TIG developer. 
 */
+
+#ifndef __LB_CONN_H__
+#define __LB_CONN_H__
+
+#include <sys/queue.h>
+
+#include <rte_hash.h>
+#include <rte_mempool.h>
+#include <rte_spinlock.h>
+#include <rte_timer.h>
+
+#include "lb_device.h"
+#include "lb_proto.h"
+#include "lb_service.h"
+#include "lb_synproxy.h"
+#include "lb_tcp_secret_seq.h"
+
+#define LB_MAX_CONN (1 << 20)
+
+#define LB_CONN_F_SYNPROXY (0x01)
+#define LB_CONN_F_ACTIVE (0x02)
+#define LB_CONN_F_TOA (0x04)
+
+struct ipv4_4tuple {
+    uint32_t sip, dip;
+    uint16_t sport, dport;
+} __attribute__((__packed__));
+
+#define IPv4_4TUPLE(t, si, sp, di, dp)                                         \
+    do {                                                                       \
+        (t)->sip = si;                                                         \
+        (t)->sport = sp;                                                       \
+        (t)->dip = di;                                                         \
+        (t)->dport = dp;                                                       \
+    } while (0)
+
+struct lb_conn {
+    TAILQ_ENTRY(lb_conn) next;
+
+    struct lb_conn_table *ct;
+
+    uint32_t cip, vip, lip, rip;
+    uint16_t cport, vport, lport, rport;
+
+    uint32_t timeout;
+    uint32_t create_time;
+    uint32_t use_time;
+
+    struct rte_timer timer;
+
+    struct lb_real_service *real_service;
+    struct lb_laddr *laddr;
+
+    uint32_t flags;
+    uint32_t state;
+
+    struct synproxy proxy;
+
+    /* tcp seq adjust */
+    struct tcp_secret_seq tseq;
+};
+
+struct lb_conn_table {
+    enum lb_proto_type type;
+    struct rte_hash *hash;
+    struct rte_mempool *mp;
+    uint32_t timeout;
+    rte_spinlock_t spinlock;
+    TAILQ_HEAD(, lb_conn) timeout_list;
+    struct rte_timer timer;
+    int (*timer_expire_cb)(struct lb_conn *, uint32_t);
+    void (*timer_task_cb)(struct lb_conn *);
+    struct {
+        uint64_t syn;
+        uint64_t vip;
+        uint64_t mp;
+        uint64_t hash;
+        uint64_t laddr;
+        uint64_t sched;
+        uint64_t rs;
+    } drop_stats;
+};
+
+#define for_each_conn_safe(var, head, field, tvar)                             \
+    for ((var) = TAILQ_FIRST((head));                                          \
+         (var) && ((tvar) = TAILQ_NEXT((var), field), 1); (var) = (tvar))
+
+struct lb_conn *lb_conn_new(struct lb_conn_table *ct, uint32_t cip,
+                            uint32_t cport, struct lb_real_service *rs,
+                            uint8_t is_synproxy, uint16_t port_id);
+void lb_conn_expire(struct lb_conn_table *ct, struct lb_conn *conn);
+struct lb_conn *lb_conn_find(struct lb_conn_table *ct, uint32_t sip,
+                             uint32_t dip, uint16_t sport, uint16_t dport,
+                             uint8_t *dir);
+int lb_conn_table_init(struct lb_conn_table *ct, enum lb_proto_type type,
+                       uint32_t lcore_id, uint32_t timeout,
+                       void (*task_cb)(struct lb_conn *),
+                       int (*expire_cb)(struct lb_conn *, uint32_t));
+
+#endif
+
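Each connection is reachable under two 4-tuple hash keys, one per traffic direction; the pair below is what lb_conn_new installs and what __conn_expire must delete symmetrically (illustration only):

    static void
    conn_keys(const struct lb_conn *conn, struct ipv4_4tuple *orig,
              struct ipv4_4tuple *reply) {
        /* client -> VIP, matched as LB_DIR_ORIGINAL */
        IPv4_4TUPLE(orig, conn->cip, conn->cport, conn->vip, conn->vport);
        /* real server -> local address, matched as LB_DIR_REPLY */
        IPv4_4TUPLE(reply, conn->rip, conn->rport, conn->lip, conn->lport);
    }
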
diff --git a/core/lb_cql.c b/core/lb_cql.c
new file mode 100644
index 0000000..40f9efc
--- /dev/null
+++ b/core/lb_cql.c
@@ -0,0 +1,177 @@
+/* Copyright (c) 2017. TIG developer. */
+
+#include <stdint.h>
+#include <sys/queue.h>
+
+#include <rte_cycles.h>
+#include <rte_hash.h>
+#include <rte_hash_crc.h>
+#include <rte_malloc.h>
+#include <rte_spinlock.h>
+
+#include "lb_cql.h"
+
+struct cql_entry {
+    TAILQ_ENTRY(cql_entry) next;
+    uint64_t time;
+};
+
+struct cql_rule {
+    uint32_t ip;
+    uint32_t qps;
+    rte_spinlock_t lock;
+    TAILQ_HEAD(, cql_entry) tbl;
+    struct cql_entry entries[0];
+};
+
+struct lb_cql {
+    struct rte_hash *h;
+    uint32_t size;
+    uint32_t socket_id;
+};
+
+static struct cql_rule *
+cql_rule_create(uint32_t ip, uint32_t qps, uint32_t socket_id) {
+    struct cql_rule *r;
+    uint32_t size;
+    uint32_t i;
+
+    size = sizeof(struct cql_rule) + qps * sizeof(struct cql_entry);
+    r = rte_zmalloc_socket(NULL, size, RTE_CACHE_LINE_SIZE, socket_id);
+    if (!r) {
+        return NULL;
+    }
+    r->ip = ip;
+    r->qps = qps;
+    rte_spinlock_init(&r->lock);
+    TAILQ_INIT(&r->tbl);
+    for (i = 0; i < qps; i++) {
+        TAILQ_INSERT_TAIL(&r->tbl, &r->entries[i], next);
+    }
+    return r;
+}
+
+static void
+cql_rule_free(struct cql_rule *r) {
+    rte_free(r);
+}
+
+static int
+cql_rule_enqueue(struct cql_rule *r, uint64_t time) {
+    struct cql_entry *e;
+
+    rte_spinlock_lock(&r->lock);
+    e = TAILQ_FIRST(&r->tbl);
+    if (e && (e->time + rte_get_tsc_hz() < time)) {
+        e->time = time;
+        TAILQ_REMOVE(&r->tbl, e, next);
+        TAILQ_INSERT_TAIL(&r->tbl, e, next);
+        rte_spinlock_unlock(&r->lock);
+        return 0;
+    }
+    rte_spinlock_unlock(&r->lock);
+    return -1;
+}
+
+static void
+__cql_rule_del(struct lb_cql *cql, uint32_t ip) {
+    struct cql_rule *r;
+
+    if (rte_hash_lookup_data(cql->h, (const void *)&ip, (void **)&r) >= 0) {
+        rte_hash_del_key(cql->h, (const void *)&ip);
+        cql_rule_free(r);
+    }
+}
+
+int
+lb_cql_rule_add(struct lb_cql *cql, uint32_t ip, uint32_t qps) {
+    struct cql_rule *r;
+
+    r = cql_rule_create(ip, qps, cql->socket_id);
+    if (!r) {
+        return -1;
+    }
+    __cql_rule_del(cql, ip);
+    if (rte_hash_add_key_data(cql->h, (const void *)&ip, r) < 0) {
+        cql_rule_free(r);
+        return -1;
+    }
+    return 0;
+}
+
+void
+lb_cql_rule_del(struct lb_cql *cql, uint32_t ip) {
+    __cql_rule_del(cql, ip);
+}
+
+int
+lb_cql_rule_iterate(struct lb_cql *cql, uint32_t *ip, uint32_t *qps,
+                    uint32_t *next) {
+    uint32_t *k;
+    struct cql_rule *r;
+    int pos;
+
+    pos = rte_hash_iterate(cql->h, (const void **)&k, (void **)&r, next);
+    if (pos >= 0) {
+        *ip = r->ip;
+        *qps = r->qps;
+    }
+    return pos;
+}
+
+int
+lb_cql_check(struct lb_cql *cql, uint32_t ip, uint64_t time) {
+    struct cql_rule *r;
+
+    if (rte_hash_lookup_data(cql->h, (const void *)&ip, (void **)&r) < 0) {
+        return 0;
+    }
+    return cql_rule_enqueue(r, time);
+}
+
+uint32_t
+lb_cql_size(struct lb_cql *cql) {
+    return cql->size;
+}
+
+struct lb_cql *
+lb_cql_create(const char *name, uint32_t size, uint32_t socket_id) {
+    struct lb_cql *cql;
+    struct rte_hash_parameters params = {0};
+
+    cql = rte_zmalloc_socket(NULL, sizeof(struct lb_cql), 0, socket_id);
+    if (!cql) {
+        return NULL;
+    }
+    cql->size = size;
+    cql->socket_id = socket_id;
+
+    params.name = name;
+    params.entries = size;
+    params.key_len = sizeof(uint32_t);
+    params.hash_func = rte_hash_crc;
+    params.socket_id = socket_id;
+    cql->h = rte_hash_create(&params);
+    if (!cql->h) {
+        rte_free(cql);
+        return NULL;
+    }
+    return cql;
+}
+
+void
+lb_cql_destroy(struct lb_cql *cql) {
+    const void *k;
+    struct cql_rule *r;
+    uint32_t n = 0;
+
+    if (!cql) {
+        return;
+    }
+    while (rte_hash_iterate(cql->h, &k, (void **)&r, &n) >= 0) {
+        cql_rule_free(r);
+    }
+    rte_hash_free(cql->h);
+    rte_free(cql);
+}
+
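Typical use of the query-limit table: create it once per socket, cap a client at some QPS, then consult it per request; a sketch against the lb_cql.h API that follows (the concrete IP is a placeholder, and the drop action is the caller's choice):

    struct lb_cql *cql = lb_cql_create("cql0", 1024, rte_socket_id());
    uint32_t client_ip = IPv4(10, 0, 0, 99); /* placeholder client */

    lb_cql_rule_add(cql, client_ip, 100); /* at most 100 queries per second */
    if (lb_cql_check(cql, client_ip, rte_rdtsc()) < 0) {
        /* all 100 entries were recycled within the last TSC second:
         * over limit, drop or reject this query */
    }
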
diff --git a/core/lb_cql.h b/core/lb_cql.h
new file mode 100644
index 0000000..b2cf45f
--- /dev/null
+++ b/core/lb_cql.h
@@ -0,0 +1,22 @@
+/* Copyright (c) 2017. TIG developer. */
+
+/* CQL: client query limit */
+
+#ifndef __LB_CQL_H__
+#define __LB_CQL_H__
+
+#include <stdint.h>
+
+struct lb_cql;
+
+int lb_cql_rule_add(struct lb_cql *cql, uint32_t ip, uint32_t qps);
+void lb_cql_rule_del(struct lb_cql *cql, uint32_t ip);
+int lb_cql_rule_iterate(struct lb_cql *cql, uint32_t *ip, uint32_t *qps,
+                        uint32_t *next);
+int lb_cql_check(struct lb_cql *cql, uint32_t ip, uint64_t time);
+uint32_t lb_cql_size(struct lb_cql *cql);
+struct lb_cql *lb_cql_create(const char *name, uint32_t size,
+                             uint32_t socket_id);
+void lb_cql_destroy(struct lb_cql *cql);
+
+#endif
+
diff --git a/core/lb_device.c b/core/lb_device.c
new file mode 100644
index 0000000..8e7478b
--- /dev/null
+++ b/core/lb_device.c
@@ -0,0 +1,1051 @@
+/* Copyright (c) 2018. TIG developer. */
+
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <net/if.h>
+#include <netinet/in.h>
+#include <sys/ioctl.h>
+#include <sys/socket.h>
+#include <unistd.h>
+
+#include <rte_cycles.h>
+#include <rte_errno.h>
+#include <rte_ethdev.h>
+#include <rte_ether.h>
+#include <rte_kni.h>
+#include <rte_lcore.h>
+#include <rte_log.h>
+#include <rte_malloc.h>
+#include <rte_mbuf.h>
+#include <rte_pci.h>
+#include <rte_ring.h>
+
+#include <unixctl_command.h>
+
+#include "lb_device.h"
+#include "lb_format.h"
+#include "lb_parser.h"
+
+struct lb_device lb_devices[RTE_MAX_ETHPORTS];
+
+static int
+__fdir_filter_input_set(uint16_t port_id, uint32_t flow_type,
+                        enum rte_eth_input_set_field field,
+                        enum rte_filter_input_set_op op) {
+    struct rte_eth_fdir_filter_info info;
+
+    memset(&info, 0, sizeof(info));
+    info.info_type = RTE_ETH_FDIR_FILTER_INPUT_SET_SELECT;
+    info.info.input_set_conf.flow_type = flow_type;
+    info.info.input_set_conf.field[0] = field;
+    info.info.input_set_conf.inset_size = 1;
+    info.info.input_set_conf.op = op;
+    return rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_FDIR,
+                                   RTE_ETH_FILTER_SET, &info);
+}
+
+static int
+_fdir_filter_input_set(uint16_t port_id) {
+    int rc = 0;
+
+    rc += __fdir_filter_input_set(port_id, RTE_ETH_FLOW_NONFRAG_IPV4_TCP,
+                                  RTE_ETH_INPUT_SET_NONE,
+                                  RTE_ETH_INPUT_SET_SELECT);
+    rc += __fdir_filter_input_set(port_id, RTE_ETH_FLOW_NONFRAG_IPV4_TCP,
+                                  RTE_ETH_INPUT_SET_L3_DST_IP4,
+                                  RTE_ETH_INPUT_SET_ADD);
+    rc += __fdir_filter_input_set(port_id, RTE_ETH_FLOW_NONFRAG_IPV4_UDP,
+                                  RTE_ETH_INPUT_SET_NONE,
+                                  RTE_ETH_INPUT_SET_SELECT);
+    rc += __fdir_filter_input_set(port_id, RTE_ETH_FLOW_NONFRAG_IPV4_UDP,
+                                  RTE_ETH_INPUT_SET_L3_DST_IP4,
+                                  RTE_ETH_INPUT_SET_ADD);
+    return rc;
+}
+
+static int
+_fdir_filter_add(uint16_t port_id, uint32_t flow_type, uint32_t dst_ip,
+                 uint32_t rxq_id) {
+    struct rte_eth_fdir_filter fdir;
+
+    memset(&fdir, 0, sizeof(fdir));
+    fdir.input.flow_type = flow_type;
+    fdir.input.flow.tcp4_flow.ip.dst_ip = dst_ip;
+    fdir.action.rx_queue = rxq_id;
+    return rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_FDIR,
+                                   RTE_ETH_FILTER_ADD, &fdir);
+}
+
+static int
+dpdk_dev_fdir_filter_add(uint16_t port_id, uint32_t dst_ip, uint32_t rxq_id) {
+    static uint8_t input_set_once[RTE_MAX_ETHPORTS] = {0};
+    int rc;
+
+    if (!input_set_once[port_id]) {
+        rc = _fdir_filter_input_set(port_id);
+        if (rc < 0) {
+            RTE_LOG(WARNING, USER1,
+                    "%s(): Unsupported FDIR input set configuration, %s.\n",
+                    __func__, lb_devices[port_id].name);
+        }
+        input_set_once[port_id] = 1;
+    }
+
+    rc = _fdir_filter_add(port_id, RTE_ETH_FLOW_NONFRAG_IPV4_TCP, dst_ip,
+                          rxq_id);
+    if (rc < 0) {
+        RTE_LOG(ERR, USER1,
+                "%s(): Add FDIR filter on device %s failed, "
+                "type:NONFRAG_IPV4_TCP, dst-ip:" IPv4_BE_FMT ", rxq:%u\n",
+                __func__, lb_devices[port_id].name, IPv4_BE_ARG(dst_ip),
+                rxq_id);
+        return rc;
+    }
+    rc = _fdir_filter_add(port_id, RTE_ETH_FLOW_NONFRAG_IPV4_UDP, dst_ip,
+                          rxq_id);
+    if (rc < 0) {
+        RTE_LOG(ERR, USER1,
+                "%s(): Add FDIR filter on device %s failed, "
+                "type:NONFRAG_IPV4_UDP, dst-ip:" IPv4_BE_FMT ", rxq:%u\n",
+                __func__, lb_devices[port_id].name, IPv4_BE_ARG(dst_ip),
+                rxq_id);
+        return rc;
+    }
+    RTE_LOG(INFO, USER1,
+            "%s(): Add FDIR filter on device %s, "
+            "type:NONFRAG_IPV4_TCP|NONFRAG_IPV4_UDP, dst-ip:" IPv4_BE_FMT
+            ", rxq:%u\n",
+            __func__, lb_devices[port_id].name, IPv4_BE_ARG(dst_ip), rxq_id);
+    return 0;
+}
+
+static int
+dpdk_dev_5tuple_filter_add(uint16_t port_id, uint32_t dst_ip, uint32_t rxq_id) {
+    struct rte_eth_ntuple_filter ntuple;
+    int rc;
+
+    memset(&ntuple, 0, sizeof(ntuple));
+    ntuple.flags = RTE_5TUPLE_FLAGS;
+    ntuple.dst_ip = dst_ip;
+    ntuple.dst_ip_mask = UINT32_MAX;
+    ntuple.priority = 1;
+    ntuple.queue = rxq_id;
+    rc = rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_NTUPLE,
+                                 RTE_ETH_FILTER_ADD, &ntuple);
+    if (rc < 0) {
+        RTE_LOG(ERR, USER1, "%s(): Device %s add 5Tuple filter failed.\n",
+                __func__, lb_devices[port_id].name);
+        return rc;
+    }
+    RTE_LOG(INFO, USER1,
+            "%s(): Add 5Tuple filter, dst-ip:" IPv4_BE_FMT ", rxq:%u\n",
+            __func__, IPv4_BE_ARG(dst_ip), rxq_id);
+    return rc;
+}
+
+static int
+dpdk_dev_filter_add(uint16_t port_id, uint32_t dst_ip, uint32_t rxq_id) {
+    if (rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_NTUPLE) == 0) {
+        return dpdk_dev_5tuple_filter_add(port_id, dst_ip, rxq_id);
+    }
+
+    RTE_LOG(ERR, USER1, "%s(): Device %s does not support 5Tuple filter.\n",
+            __func__, lb_devices[port_id].name);
+
+    if (rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_FDIR) == 0) {
+        return dpdk_dev_fdir_filter_add(port_id, dst_ip, rxq_id);
+    }
+
+    RTE_LOG(ERR, USER1, "%s(): Device %s does not support FDIR filter.\n",
+            __func__, lb_devices[port_id].name);
+
+    return -1;
+}
+
+static int
+kni_get_mac(const char *name, struct ether_addr *ha) {
+    int fd;
+    struct ifreq req;
+
+    fd = socket(AF_INET, SOCK_STREAM, 0);
+    if (fd < 0) {
+        RTE_LOG(ERR, USER1, "%s(): Create SOCK_STREAM socket failed, %s\n",
+                __func__, strerror(errno));
+        return -1;
+    }
+
+    /* Get KNI MAC */
+    memset(&req, 0, sizeof(struct ifreq));
+    strncpy(req.ifr_name, name, IFNAMSIZ);
+    req.ifr_addr.sa_family = AF_INET;
+
+    if (ioctl(fd, SIOCGIFHWADDR, &req) < 0) {
+        RTE_LOG(ERR, USER1, "%s(): Get MAC failed, %s\n", __func__,
+                strerror(errno));
+        close(fd);
+        return -1;
+    }
+
+    close(fd);
+    memcpy(ha->addr_bytes, req.ifr_hwaddr.sa_data, ETHER_ADDR_LEN);
+
+    return 0;
+}
+
+static void
+tx_buffer_callback(struct rte_mbuf **pkts, uint16_t unsend, void *userdata) {
+    uint16_t i;
+    struct lb_device *dev = userdata;
+
+    for (i = 0; i < unsend; i++) {
+        rte_pktmbuf_free(pkts[i]);
+    }
+    dev->lcore_stats[rte_lcore_id()].tx_dropped += unsend;
+}
+
+static struct rte_ring *
+l4_ports_create(const char *name, uint16_t min, uint16_t max,
+                uint32_t socket_id) {
+    struct rte_ring *r;
+    uint16_t p;
+
+    r = rte_ring_create(name, UINT16_MAX + 1, socket_id,
+                        RING_F_SP_ENQ | RING_F_SC_DEQ);
+    if (r == NULL) {
+        RTE_LOG(ERR, USER1, "%s(): Create ports ring %s failed, %s.\n",
+                __func__, name, rte_strerror(rte_errno));
+        return NULL;
+    }
+    for (p = min; p != max; p++) {
+        rte_ring_sp_enqueue(r, (void *)(uintptr_t)rte_cpu_to_be_16(p));
+    }
+    return r;
+}
+
+static int
+laddr_init(int port_id) {
+    struct lb_device *dev;
+    uint32_t i, socket_id, lcore_id;
+    struct lb_laddr_list *laddr_list;
+    struct lb_laddr *laddr;
+    char name[RTE_RING_NAMESIZE];
+    int rc;
+
+    dev = &lb_devices[port_id];
+    socket_id = rte_eth_dev_socket_id(port_id);
+
RTE_LCORE_FOREACH_SLAVE(lcore_id) { + laddr_list = &dev->laddr_list[lcore_id]; + for (i = 0; i < laddr_list->nb; i++) { + laddr = &laddr_list->entries[i]; + + snprintf(name, sizeof(name), "tcpport%p", laddr); + laddr->ports[LB_IPPROTO_TCP] = l4_ports_create( + name, LB_MIN_L4_PORT, LB_MAX_L4_PORT, socket_id); + + snprintf(name, sizeof(name), "udpport%p", laddr); + laddr->ports[LB_IPPROTO_UDP] = l4_ports_create( + name, LB_MIN_L4_PORT, LB_MAX_L4_PORT, socket_id); + + if (laddr->ports[LB_IPPROTO_TCP] == NULL || + laddr->ports[LB_IPPROTO_UDP] == NULL) { + RTE_LOG(ERR, USER1, "%s(): l4_ports_create failed.\n", + __func__); + return -1; + } + + rc = dpdk_dev_filter_add(port_id, laddr->ipv4, laddr->rxq_id); + if (rc < 0) { + RTE_LOG(ERR, USER1, "%s(): dpdk_dev_filter_add failed.\n", + __func__); + return rc; + } + } + } + return 0; +} + +static int +dpdk_device_init(int port_id) { + struct lb_device *dev; + struct rte_eth_dev_info info; + struct rte_eth_conf dev_conf; + uint16_t i; + uint32_t socket_id; + uint32_t mp_size; + char mp_name[RTE_MEMPOOL_NAMESIZE]; + struct rte_kni_conf kni_conf; + struct rte_kni_ops kni_ops; + int rc; + + dev = &lb_devices[port_id]; + + /* 0) Get device hardware info. */ + rte_eth_dev_info_get(port_id, &info); + dev->rxq_size = RTE_MIN(dev->rxq_size, info.rx_desc_lim.nb_max); + dev->rxq_size = RTE_MAX(dev->rxq_size, info.rx_desc_lim.nb_min); + dev->txq_size = RTE_MIN(dev->txq_size, info.tx_desc_lim.nb_max); + dev->txq_size = RTE_MAX(dev->txq_size, info.tx_desc_lim.nb_min); + dev->mtu = + RTE_MIN(dev->mtu, info.max_rx_pktlen - ETHER_HDR_LEN - ETHER_CRC_LEN); + dev->mtu = RTE_MAX(dev->mtu, ETHER_MIN_MTU); + + /* 1) Create pktmbuf mempool for RX queue. */ + socket_id = rte_eth_dev_socket_id(port_id); + mp_size = dev->nb_rxq * dev->rxq_size + dev->nb_txq * dev->txq_size; + snprintf(mp_name, sizeof(mp_name), "mp%p", dev); + dev->mp = rte_pktmbuf_pool_create(mp_name, mp_size, + /* cache_size */ + 32, + /* priv_size */ + 0, + /* data_room_size */ + dev->mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + + RTE_PKTMBUF_HEADROOM, + socket_id); + if (dev->mp == NULL) { + RTE_LOG(ERR, USER1, "%s(): Create pktmbuf mempool failed, %s.\n", + __func__, rte_strerror(rte_errno)); + return -1; + } + + /* 2) Config and start device. 
 */
+    memset(&dev_conf, 0, sizeof(dev_conf));
+    dev_conf.rxmode.mq_mode = ETH_MQ_RX_RSS;
+    if (dev->mtu > ETHER_MTU) {
+        dev_conf.rxmode.max_rx_pkt_len =
+            dev->mtu + ETHER_HDR_LEN + ETHER_CRC_LEN;
+        dev_conf.rxmode.jumbo_frame = 1;
+    }
+    dev_conf.rx_adv_conf.rss_conf.rss_hf = ETH_RSS_PROTO_MASK;
+    dev_conf.fdir_conf.mode = RTE_FDIR_MODE_PERFECT;
+    dev_conf.fdir_conf.mask.ipv4_mask.src_ip = 0xFFFFFFFF;
+    dev_conf.fdir_conf.mask.ipv4_mask.dst_ip = 0xFFFFFFFF;
+    dev_conf.fdir_conf.mask.src_port_mask = 0xFFFF;
+    dev_conf.fdir_conf.mask.dst_port_mask = 0xFFFF;
+    dev_conf.fdir_conf.drop_queue = 127;
+    rc = rte_eth_dev_configure(port_id, dev->nb_rxq, dev->nb_txq, &dev_conf);
+    if (rc < 0) {
+        RTE_LOG(ERR, USER1, "%s(): config port%u failed, %s.\n", __func__,
+                port_id, strerror(-rc));
+        return rc;
+    }
+
+    for (i = 0; i < dev->nb_rxq; i++) {
+        rc = rte_eth_rx_queue_setup(port_id, i, dev->rxq_size, socket_id, NULL,
+                                    dev->mp);
+        if (rc < 0) {
+            RTE_LOG(ERR, USER1, "%s(): Setup the rxq%u of port%u failed, %s.\n",
+                    __func__, i, port_id, strerror(-rc));
+            return rc;
+        }
+    }
+
+    for (i = 0; i < dev->nb_txq; i++) {
+        rc = rte_eth_tx_queue_setup(port_id, i, dev->txq_size, socket_id, NULL);
+        if (rc < 0) {
+            RTE_LOG(ERR, USER1, "%s(): Setup the txq%u of port%u failed, %s.\n",
+                    __func__, i, port_id, strerror(-rc));
+            return rc;
+        }
+    }
+
+    rte_eth_promiscuous_enable(port_id);
+
+    /* 3) Create KNI. */
+    if (dev->type == LB_DEV_T_NORM || dev->type == LB_DEV_T_MASTER) {
+        memset(&kni_conf, 0, sizeof(kni_conf));
+        memcpy(kni_conf.name, dev->name, RTE_KNI_NAMESIZE);
+        kni_conf.core_id = 0;
+        kni_conf.force_bind = 1;
+        kni_conf.group_id = port_id;
+        kni_conf.mbuf_size =
+            dev->mtu + ETHER_HDR_LEN + ETHER_CRC_LEN + RTE_PKTMBUF_HEADROOM;
+        kni_conf.addr = info.pci_dev->addr;
+        kni_conf.id = info.pci_dev->id;
+
+        memset(&kni_ops, 0, sizeof(kni_ops));
+        kni_ops.port_id = port_id;
+        kni_ops.change_mtu = NULL;
+        kni_ops.config_network_if = NULL;
+
+        dev->kni = rte_kni_alloc(dev->mp, &kni_conf, &kni_ops);
+        if (dev->kni == NULL) {
+            RTE_LOG(ERR, USER1, "%s(): Create kni %s failed.\n", __func__,
+                    dev->name);
+            return -1;
+        }
+
+        if (kni_get_mac(dev->name, &dev->ha) < 0) {
+            RTE_LOG(ERR, USER1, "%s(): kni_get_mac failed.\n", __func__);
+            return -1;
+        }
+    }
+
+    /* 4) Create tx buffers. */
+    if (dev->type == LB_DEV_T_NORM || dev->type == LB_DEV_T_MASTER) {
+        uint32_t lcore_id;
+
+        RTE_LCORE_FOREACH(lcore_id) {
+            if (lcore_id != rte_get_master_lcore() &&
+                socket_id != rte_lcore_to_socket_id(lcore_id)) {
+                continue;
+            }
+            dev->tx_buffer[lcore_id] = rte_zmalloc_socket(
+                "tx-buffer", RTE_ETH_TX_BUFFER_SIZE(PKT_MAX_BURST),
+                RTE_CACHE_LINE_SIZE, socket_id);
+            if (dev->tx_buffer[lcore_id] == NULL) {
+                RTE_LOG(ERR, USER1, "%s(): Create tx pkt buffer failed.\n",
+                        __func__);
+                return -1;
+            }
+
+            rte_eth_tx_buffer_init(dev->tx_buffer[lcore_id], PKT_MAX_BURST);
+            rte_eth_tx_buffer_set_err_callback(dev->tx_buffer[lcore_id],
+                                               tx_buffer_callback, dev);
+        }
+    }
+
+    /* 5) Create master-worker ring. */
+    if (dev->type == LB_DEV_T_NORM || dev->type == LB_DEV_T_MASTER) {
+        char rname[RTE_RING_NAMESIZE];
+        uint32_t size;
+
+        snprintf(rname, sizeof(rname), "ring%p", dev);
+        size = PKT_MAX_BURST * dev->nb_rxq;
+        dev->ring = rte_ring_create(rname, size, socket_id,
+                                    RING_F_SC_DEQ | RING_F_EXACT_SZ);
+        if (dev->ring == NULL) {
+            RTE_LOG(ERR, USER1, "%s(): Create master-worker ring failed.\n",
+                    __func__);
+            return -1;
+        }
+    }
+
+    /* 6) Create local address.
 */
+    if (dev->type == LB_DEV_T_NORM || dev->type == LB_DEV_T_MASTER) {
+        rc = laddr_init(port_id);
+        if (rc < 0) {
+            RTE_LOG(ERR, USER1, "%s(): Create local address failed.\n",
+                    __func__);
+            return -1;
+        }
+    }
+
+    rc = rte_eth_dev_start(port_id);
+    if (rc < 0) {
+        RTE_LOG(ERR, USER1, "%s(): Start port%u failed, %s.\n", __func__,
+                port_id, strerror(-rc));
+        return rc;
+    }
+
+    return 0;
+}
+
+int
+lb_device_init(struct lb_device_conf *configs, uint16_t num) {
+    struct lb_device_conf *conf;
+    uint16_t i, port_id;
+    int rc;
+    char pci_name[PCI_PRI_STR_SIZE];
+    uint32_t socket_id, lcore_id;
+    struct lb_device *dev;
+    uint16_t qid;
+    struct lb_laddr_list *laddr_list;
+    uint32_t j = 0, avg, lip_id;
+
+    RTE_LOG(INFO, USER1, "%s(): lb_devices[%u] size = %luKB\n", __func__,
+            RTE_MAX_ETHPORTS, (sizeof(lb_devices) + 1023) / 1024);
+
+    /* 0) Initialize kni. */
+    rte_kni_init(num);
+
+    /* 1) Initialize normal device. */
+    for (i = 0; i < num; i++) {
+        conf = &configs[i];
+
+        if (conf->nb_pcis != 1)
+            continue;
+
+        /* a) Get the port id by pci address. */
+        rte_pci_device_name(&conf->pcis[0], pci_name, sizeof(pci_name));
+        rc = rte_eth_dev_get_port_by_name(pci_name, &port_id);
+        if (rc < 0) {
+            RTE_LOG(ERR, USER1,
+                    "%s(): Get port id from pci address(%s) failed.\n",
+                    __func__, pci_name);
+            return rc;
+        }
+
+        /* b) Get the socket id by port id. */
+        rc = rte_eth_dev_socket_id(port_id);
+        if (rc < 0) {
+            RTE_LOG(ERR, USER1, "%s(): Get the socket id of port%u failed.\n",
+                    __func__, port_id);
+            return rc;
+        }
+        socket_id = rc;
+
+        /* c) Get the lcores by socket id. */
+        dev = &lb_devices[port_id];
+        qid = 0;
+        RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+            if (rte_lcore_to_socket_id(lcore_id) == socket_id) {
+                dev->lcore_conf[lcore_id].rxq_enable = 1;
+                dev->lcore_conf[lcore_id].rxq_id = qid;
+                dev->lcore_conf[lcore_id].txq_id = qid;
+                qid++;
+            }
+        }
+        lcore_id = rte_get_master_lcore();
+        dev->lcore_conf[lcore_id].txq_id = qid;
+
+        dev->nb_rxq = qid;
+        dev->nb_txq = qid + 1;
+
+        /* d) Copy config info to device. */
+        dev->rxq_size = conf->rxqsize;
+        dev->txq_size = conf->txqsize;
+        dev->rx_offload = conf->rxoffload;
+        dev->tx_offload = conf->txoffload;
+        dev->ipv4 = conf->ipv4;
+        dev->netmask = conf->netmask;
+        dev->gw = conf->gw;
+        dev->mtu = conf->mtu;
+        memcpy(dev->name, conf->name, sizeof(dev->name));
+
+        avg = conf->nb_lips / dev->nb_rxq;
+        if (avg == 0) {
+            RTE_LOG(ERR, USER1,
+                    "%s(): The number of local IPv4 addresses is less than "
+                    "the number of RX queues of %s.\n",
+                    __func__, dev->name);
+            return -1;
+        }
+        lip_id = 0;
+        RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+            laddr_list = &dev->laddr_list[lcore_id];
+            laddr_list->nb = avg;
+            for (j = 0; j < avg; j++) {
+                laddr_list->entries[j].ipv4 = conf->lips[lip_id];
+                laddr_list->entries[j].port_id = i;
+                laddr_list->entries[j].rxq_id =
+                    dev->lcore_conf[lcore_id].rxq_id;
+                lip_id++;
+            }
+        }
+        RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+            if (lip_id == conf->nb_lips) {
+                break;
+            }
+
+            laddr_list = &dev->laddr_list[lcore_id];
+            laddr_list->nb += 1;
+            laddr_list->entries[j].ipv4 = conf->lips[lip_id];
+            laddr_list->entries[j].port_id = i;
+            laddr_list->entries[j].rxq_id = dev->lcore_conf[lcore_id].rxq_id;
+            lip_id++;
+        }
+
+        /* e) Initialize queue, kni, mp, txbuffer, ring. */
+        dev->type = LB_DEV_T_NORM;
+        rc = dpdk_device_init(port_id);
+        if (rc < 0) {
+            RTE_LOG(ERR, USER1, "%s(): Initialize port%u failed.\n", __func__,
+                    port_id);
+            return rc;
+        }
+    }
+
+    /* 2) Initialize bond device.
*/ + for (i = 0; i < num; i++) { + } + + return 0; +} + +/* UNIXCTL COMMANDS */ + +/* + Returns: + throughput = [pps_rx, pps_tx, bps_rx, bps_tx] +*/ +static void +netdev_throughput_get(struct rte_eth_stats *stats, uint64_t throughput[]) { + static uint64_t prev_pkts_rx, prev_bytes_rx; + static uint64_t prev_pkts_tx, prev_bytes_tx; + static uint64_t prev_cycles; + uint64_t diff_pkts_rx, diff_pkts_tx, diff_cycles; + uint64_t diff_bytes_rx, diff_bytes_tx; + + diff_cycles = prev_cycles; + prev_cycles = rte_rdtsc(); + + if (diff_cycles > 0) { + diff_cycles = prev_cycles - diff_cycles; + } + + diff_pkts_rx = stats->ipackets - prev_pkts_rx; + diff_pkts_tx = stats->opackets - prev_pkts_tx; + prev_pkts_rx = stats->ipackets; + prev_pkts_tx = stats->opackets; + throughput[0] = + diff_cycles > 0 ? diff_pkts_rx * rte_get_tsc_hz() / diff_cycles : 0; + throughput[1] = + diff_cycles > 0 ? diff_pkts_tx * rte_get_tsc_hz() / diff_cycles : 0; + + diff_bytes_rx = stats->ibytes - prev_bytes_rx; + diff_bytes_tx = stats->obytes - prev_bytes_tx; + prev_bytes_rx = stats->ibytes; + prev_bytes_tx = stats->obytes; + throughput[2] = + diff_cycles > 0 ? diff_bytes_rx * rte_get_tsc_hz() / diff_cycles : 0; + throughput[3] = + diff_cycles > 0 ? diff_bytes_tx * rte_get_tsc_hz() / diff_cycles : 0; +} + +static void +netdev_show_stats_cmd_cb(int fd, char *argv[], int argc) { + uint16_t nb_ports, port_id; + int json_fmt, json_first_obj = 1; + struct lb_device *dev; + struct rte_eth_stats stats; + uint32_t lcore_id; + uint64_t tx_dropped; + uint64_t rx_dropped; + uint64_t throughput[4]; + uint32_t mbuf_in_use, mbuf_avail; + + if (argc > 0 && strcmp(argv[0], "--json") == 0) { + json_fmt = 1; + } else { + json_fmt = 0; + } + + if (json_fmt) + unixctl_command_reply(fd, "["); + + nb_ports = rte_eth_dev_count(); + for (port_id = 0; port_id < nb_ports; port_id++) { + dev = &lb_devices[port_id]; + if (dev->type != LB_DEV_T_NORM && dev->type != LB_DEV_T_MASTER) { + continue; + } + + tx_dropped = 0; + rx_dropped = 0; + memset(throughput, 0, sizeof(throughput)); + + rte_eth_stats_get(port_id, &stats); + RTE_LCORE_FOREACH(lcore_id) { + tx_dropped += dev->lcore_stats[lcore_id].tx_dropped; + rx_dropped += dev->lcore_stats[lcore_id].rx_dropped; + } + netdev_throughput_get(&stats, throughput); + + mbuf_in_use = rte_mempool_in_use_count(dev->mp); + mbuf_avail = rte_mempool_avail_count(dev->mp); + + if (json_fmt) { + unixctl_command_reply(fd, json_first_obj ? "{" : ",{"); + json_first_obj = 0; + } + + unixctl_command_reply(fd, + json_fmt ? JSON_KV_S_FMT("dev", ",") + : NORM_KV_S_FMT("dev", "\n"), + dev->name); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("RX-packets", ",") + : NORM_KV_64_FMT(" RX-packets", "\n"), + stats.ipackets); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("RX-bytes", ",") + : NORM_KV_64_FMT(" RX-bytes", "\n"), + stats.ibytes); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("RX-errors", ",") + : NORM_KV_64_FMT(" RX-errors", "\n"), + stats.ierrors); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("RX-nombuf", ",") + : NORM_KV_64_FMT(" RX-nombuf", "\n"), + stats.rx_nombuf); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("RX-misses", ",") + : NORM_KV_64_FMT(" RX-misses", "\n"), + stats.imissed); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("RX-dropped", ",") + : NORM_KV_64_FMT(" RX-dropped", "\n"), + rx_dropped); + unixctl_command_reply(fd, + json_fmt ? 
JSON_KV_64_FMT("TX-packets", ",") + : NORM_KV_64_FMT(" TX-packets", "\n"), + stats.opackets); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("TX-bytes", ",") + : NORM_KV_64_FMT(" TX-bytes", "\n"), + stats.obytes); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("TX-errors", ",") + : NORM_KV_64_FMT(" TX-errors", "\n"), + stats.oerrors); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("TX-dropped", ",") + : NORM_KV_64_FMT(" TX-dropped", "\n"), + tx_dropped); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("Rx-pps", ",") + : NORM_KV_64_FMT(" Rx-pps", "\n"), + throughput[0]); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("Tx-pps", ",") + : NORM_KV_64_FMT(" Tx-pps", "\n"), + throughput[1]); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("Rx-Bps", ",") + : NORM_KV_64_FMT(" Rx-Bps", "\n"), + throughput[2]); + unixctl_command_reply(fd, + json_fmt ? JSON_KV_64_FMT("Tx-Bps", ",") + : NORM_KV_64_FMT(" Tx-Bps", "\n"), + throughput[3]); + unixctl_command_reply(fd, + json_fmt + ? JSON_KV_32_FMT("pktmbuf-in-use", ",") + : NORM_KV_32_FMT(" pktmbuf-in-use", "\n"), + mbuf_in_use); + unixctl_command_reply(fd, + json_fmt + ? JSON_KV_32_FMT("pktmbuf-avail", "}") + : NORM_KV_32_FMT(" pktmbuf-avail", "\n"), + mbuf_avail); + } + if (json_fmt) + unixctl_command_reply(fd, "]\n"); +} + +UNIXCTL_CMD_REGISTER("netdev/stats", "[--json]", "Show NIC packet statistics.", + 0, 1, netdev_show_stats_cmd_cb); + +static void +netdev_reset_stats_cmd_cb(__attribute__((unused)) int fd, + __attribute__((unused)) char *argv[], + __attribute__((unused)) int argc) { + uint32_t port_id, nb_ports; + struct lb_device *dev; + + nb_ports = rte_eth_dev_count(); + for (port_id = 0; port_id < nb_ports; port_id++) { + dev = &lb_devices[port_id]; + if (dev->type != LB_DEV_T_NORM && dev->type != LB_DEV_T_MASTER) { + continue; + } + + rte_eth_stats_reset(port_id); + memset(dev->lcore_stats, 0, sizeof(dev->lcore_stats)); + } +} + +UNIXCTL_CMD_REGISTER("netdev/reset", "", "Reset NIC packet statistics.", 0, 0, + netdev_reset_stats_cmd_cb); + +static void +netdev_show_ipaddr_cmd_cb(int fd, __attribute__((unused)) char *argv[], + __attribute__((unused)) int argc) { + uint32_t port_id, nb_ports; + struct lb_device *dev; + struct lb_laddr_list *laddr_list; + struct lb_laddr *laddr; + char buf[32]; + uint32_t lcore_id; + uint32_t i; + + nb_ports = rte_eth_dev_count(); + for (port_id = 0; port_id < nb_ports; port_id++) { + dev = &lb_devices[port_id]; + if (dev->type != LB_DEV_T_NORM && dev->type != LB_DEV_T_MASTER) { + continue; + } + + unixctl_command_reply(fd, "dev: %s\n", dev->name); + ipv4_addr_tostring(dev->ipv4, buf, sizeof(buf)); + unixctl_command_reply(fd, " kni-ip: %s\n", buf); + ipv4_addr_tostring(dev->netmask, buf, sizeof(buf)); + unixctl_command_reply(fd, " kni-netmask: %s\n", buf); + ipv4_addr_tostring(dev->gw, buf, sizeof(buf)); + unixctl_command_reply(fd, " kni-gw: %s\n", buf); + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + laddr_list = &dev->laddr_list[lcore_id]; + for (i = 0; i < laddr_list->nb; i++) { + laddr = &laddr_list->entries[i]; + ipv4_addr_tostring(laddr->ipv4, buf, sizeof(buf)); + unixctl_command_reply(fd, " local-ip[c%uq%u]: %s\n", lcore_id, + laddr->rxq_id, buf); + } + } + } +} + +UNIXCTL_CMD_REGISTER("netdev/ipaddr", "", "Show KNI/LOCAL ipv4 address.", 0, 0, + netdev_show_ipaddr_cmd_cb); + +static void +netdev_show_hwinfo_cmd_cb(int fd, __attribute__((unused)) char *argv[], + __attribute__((unused)) int argc) { + uint32_t port_id, nb_ports; + struct lb_device *dev; + char mac[32]; 
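/* per-port link state, filled in by rte_eth_link_get() below */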
+ struct rte_eth_link link_params; + + nb_ports = rte_eth_dev_count(); + for (port_id = 0; port_id < nb_ports; port_id++) { + dev = &lb_devices[port_id]; + if (dev->type != LB_DEV_T_NORM && dev->type != LB_DEV_T_MASTER) { + continue; + } + + unixctl_command_reply(fd, "dev: %s\n", dev->name); + mac_addr_tostring(&dev->ha, mac, sizeof(mac)); + unixctl_command_reply(fd, " hw: %s\n", mac); + + unixctl_command_reply(fd, " rxq-num: %u\n", dev->nb_rxq); + memset(&link_params, 0, sizeof(link_params)); + rte_eth_link_get(port_id, &link_params); + unixctl_command_reply(fd, " link-status: %s\n", + link_params.link_status == ETH_LINK_DOWN ? "DOWN" + : "UP"); + } +} + +UNIXCTL_CMD_REGISTER("netdev/hwinfo", "", "Show NIC link-status.", 0, 0, + netdev_show_hwinfo_cmd_cb); + +static char * +flowtype_to_str(uint16_t flow_type) { + struct flow_type_info { + char str[32]; + uint16_t ftype; + }; + + uint8_t i; + static struct flow_type_info flowtype_str_table[] = { + {"raw", RTE_ETH_FLOW_RAW}, + {"ipv4", RTE_ETH_FLOW_IPV4}, + {"ipv4-frag", RTE_ETH_FLOW_FRAG_IPV4}, + {"ipv4-tcp", RTE_ETH_FLOW_NONFRAG_IPV4_TCP}, + {"ipv4-udp", RTE_ETH_FLOW_NONFRAG_IPV4_UDP}, + {"ipv4-sctp", RTE_ETH_FLOW_NONFRAG_IPV4_SCTP}, + {"ipv4-other", RTE_ETH_FLOW_NONFRAG_IPV4_OTHER}, + {"ipv6", RTE_ETH_FLOW_IPV6}, + {"ipv6-frag", RTE_ETH_FLOW_FRAG_IPV6}, + {"ipv6-tcp", RTE_ETH_FLOW_NONFRAG_IPV6_TCP}, + {"ipv6-udp", RTE_ETH_FLOW_NONFRAG_IPV6_UDP}, + {"ipv6-sctp", RTE_ETH_FLOW_NONFRAG_IPV6_SCTP}, + {"ipv6-other", RTE_ETH_FLOW_NONFRAG_IPV6_OTHER}, + {"l2_payload", RTE_ETH_FLOW_L2_PAYLOAD}, + {"port", RTE_ETH_FLOW_PORT}, + {"vxlan", RTE_ETH_FLOW_VXLAN}, + {"geneve", RTE_ETH_FLOW_GENEVE}, + {"nvgre", RTE_ETH_FLOW_NVGRE}, + }; + + for (i = 0; i < RTE_DIM(flowtype_str_table); i++) { + if (flowtype_str_table[i].ftype == flow_type) + return flowtype_str_table[i].str; + } + + return NULL; +} + +static inline void +print_fdir_flex_mask(int fd, struct rte_eth_fdir_flex_conf *flex_conf, + uint32_t num) { + struct rte_eth_fdir_flex_mask *mask; + uint32_t i, j; + char *p; + + for (i = 0; i < flex_conf->nb_flexmasks; i++) { + mask = &flex_conf->flex_mask[i]; + p = flowtype_to_str(mask->flow_type); + unixctl_command_reply(fd, "\n %s:\t", p ? 
p : "unknown"); + for (j = 0; j < num; j++) + unixctl_command_reply(fd, " %02x", mask->mask[j]); + } + unixctl_command_reply(fd, "\n"); +} + +static inline void +print_fdir_mask(int fd, struct rte_eth_fdir_masks *mask, + enum rte_fdir_mode mode) { + unixctl_command_reply(fd, "\n vlan_tci: 0x%04x", + rte_be_to_cpu_16(mask->vlan_tci_mask)); + + if (mode == RTE_FDIR_MODE_PERFECT_TUNNEL) + unixctl_command_reply(fd, + ", mac_addr: 0x%02x, tunnel_type: 0x%01x," + " tunnel_id: 0x%08x", + mask->mac_addr_byte_mask, mask->tunnel_type_mask, + rte_be_to_cpu_32(mask->tunnel_id_mask)); + else if (mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN) { + unixctl_command_reply(fd, ", src_ipv4: 0x%08x, dst_ipv4: 0x%08x", + rte_be_to_cpu_32(mask->ipv4_mask.src_ip), + rte_be_to_cpu_32(mask->ipv4_mask.dst_ip)); + + unixctl_command_reply(fd, "\n src_port: 0x%04x, dst_port: 0x%04x", + rte_be_to_cpu_16(mask->src_port_mask), + rte_be_to_cpu_16(mask->dst_port_mask)); + + unixctl_command_reply(fd, "\n src_ipv6: 0x%08x,0x%08x,0x%08x,0x%08x", + rte_be_to_cpu_32(mask->ipv6_mask.src_ip[0]), + rte_be_to_cpu_32(mask->ipv6_mask.src_ip[1]), + rte_be_to_cpu_32(mask->ipv6_mask.src_ip[2]), + rte_be_to_cpu_32(mask->ipv6_mask.src_ip[3])); + + unixctl_command_reply(fd, "\n dst_ipv6: 0x%08x,0x%08x,0x%08x,0x%08x", + rte_be_to_cpu_32(mask->ipv6_mask.dst_ip[0]), + rte_be_to_cpu_32(mask->ipv6_mask.dst_ip[1]), + rte_be_to_cpu_32(mask->ipv6_mask.dst_ip[2]), + rte_be_to_cpu_32(mask->ipv6_mask.dst_ip[3])); + } + + unixctl_command_reply(fd, "\n"); +} + +static inline void +print_fdir_flex_payload(int fd, struct rte_eth_fdir_flex_conf *flex_conf, + uint32_t num) { + struct rte_eth_flex_payload_cfg *cfg; + uint32_t i, j; + + for (i = 0; i < flex_conf->nb_payloads; i++) { + cfg = &flex_conf->flex_set[i]; + if (cfg->type == RTE_ETH_RAW_PAYLOAD) + unixctl_command_reply(fd, "\n RAW: "); + else if (cfg->type == RTE_ETH_L2_PAYLOAD) + unixctl_command_reply(fd, "\n L2_PAYLOAD: "); + else if (cfg->type == RTE_ETH_L3_PAYLOAD) + unixctl_command_reply(fd, "\n L3_PAYLOAD: "); + else if (cfg->type == RTE_ETH_L4_PAYLOAD) + unixctl_command_reply(fd, "\n L4_PAYLOAD: "); + else + unixctl_command_reply(fd, + "\n UNKNOWN PAYLOAD(%u): ", cfg->type); + for (j = 0; j < num; j++) + unixctl_command_reply(fd, " %-5u", cfg->src_offset[j]); + } + unixctl_command_reply(fd, "\n"); +} + +static inline void +print_fdir_flow_type(int fd, uint32_t flow_types_mask) { + int i; + char *p; + + for (i = RTE_ETH_FLOW_UNKNOWN; i < RTE_ETH_FLOW_MAX; i++) { + if (!(flow_types_mask & (1 << i))) + continue; + p = flowtype_to_str(i); + if (p) + unixctl_command_reply(fd, " %s", p); + else + unixctl_command_reply(fd, " unknown"); + } + unixctl_command_reply(fd, "\n"); +} + +static void +netdev_show_fdir_cmd_cb(int fd, __attribute__((unused)) char *argv[], + __attribute__((unused)) int argc) { + struct rte_eth_fdir_stats fdir_stat; + struct rte_eth_fdir_info fdir_info; + uint16_t nb_ports, port_id; + struct lb_device *dev; + int ret; + + nb_ports = rte_eth_dev_count(); + for (port_id = 0; port_id < nb_ports; port_id++) { + dev = &lb_devices[port_id]; + if (dev->type != LB_DEV_T_NORM && dev->type != LB_DEV_T_MASTER) { + continue; + } + + static const char *fdir_stats_border = "########################"; + + ret = rte_eth_dev_filter_supported(port_id, RTE_ETH_FILTER_FDIR); + if (ret < 0) { + unixctl_command_reply(fd, "\n FDIR is not supported on port %-2d\n", + port_id); + return; + } + + memset(&fdir_info, 0, sizeof(fdir_info)); + rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_FDIR, + 
RTE_ETH_FILTER_INFO, &fdir_info); + memset(&fdir_stat, 0, sizeof(fdir_stat)); + rte_eth_dev_filter_ctrl(port_id, RTE_ETH_FILTER_FDIR, + RTE_ETH_FILTER_STATS, &fdir_stat); + unixctl_command_reply(fd, "\n%s FDIR infos for port %s %s\n", + fdir_stats_border, dev->name, fdir_stats_border); + unixctl_command_reply(fd, " MODE: "); + if (fdir_info.mode == RTE_FDIR_MODE_PERFECT) + unixctl_command_reply(fd, " PERFECT\n"); + else if (fdir_info.mode == RTE_FDIR_MODE_PERFECT_MAC_VLAN) + unixctl_command_reply(fd, " PERFECT-MAC-VLAN\n"); + else if (fdir_info.mode == RTE_FDIR_MODE_PERFECT_TUNNEL) + unixctl_command_reply(fd, " PERFECT-TUNNEL\n"); + else if (fdir_info.mode == RTE_FDIR_MODE_SIGNATURE) + unixctl_command_reply(fd, " SIGNATURE\n"); + else + unixctl_command_reply(fd, " DISABLE\n"); + if (fdir_info.mode != RTE_FDIR_MODE_PERFECT_MAC_VLAN && + fdir_info.mode != RTE_FDIR_MODE_PERFECT_TUNNEL) { + unixctl_command_reply(fd, " SUPPORTED FLOW TYPE: "); + print_fdir_flow_type(fd, fdir_info.flow_types_mask[0]); + } + unixctl_command_reply(fd, " FLEX PAYLOAD INFO:\n"); + unixctl_command_reply( + fd, + " max_len: %-10" PRIu32 " payload_limit: %-10" PRIu32 "\n" + " payload_unit: %-10" PRIu32 " payload_seg: %-10" PRIu32 "\n" + " bitmask_unit: %-10" PRIu32 " bitmask_num: %-10" PRIu32 "\n", + fdir_info.max_flexpayload, fdir_info.flex_payload_limit, + fdir_info.flex_payload_unit, fdir_info.max_flex_payload_segment_num, + fdir_info.flex_bitmask_unit, fdir_info.max_flex_bitmask_num); + unixctl_command_reply(fd, " MASK: "); + print_fdir_mask(fd, &fdir_info.mask, fdir_info.mode); + if (fdir_info.flex_conf.nb_payloads > 0) { + unixctl_command_reply(fd, " FLEX PAYLOAD SRC OFFSET:"); + print_fdir_flex_payload(fd, &fdir_info.flex_conf, + fdir_info.max_flexpayload); + } + if (fdir_info.flex_conf.nb_flexmasks > 0) { + unixctl_command_reply(fd, " FLEX MASK CFG:"); + print_fdir_flex_mask(fd, &fdir_info.flex_conf, + fdir_info.max_flexpayload); + } + unixctl_command_reply( + fd, " guarant_count: %-10" PRIu32 " best_count: %" PRIu32 "\n", + fdir_stat.guarant_cnt, fdir_stat.best_cnt); + unixctl_command_reply( + fd, " guarant_space: %-10" PRIu32 " best_space: %" PRIu32 "\n", + fdir_info.guarant_spc, fdir_info.best_spc); + unixctl_command_reply( + fd, + " collision: %-10" PRIu32 " free: %" PRIu32 "\n" + " maxhash: %-10" PRIu32 " maxlen: %" PRIu32 "\n" + " add: %-10" PRIu64 " remove: %" PRIu64 "\n" + " f_add: %-10" PRIu64 " f_remove: %" PRIu64 "\n", + fdir_stat.collision, fdir_stat.free, fdir_stat.maxhash, + fdir_stat.maxlen, fdir_stat.add, fdir_stat.remove, fdir_stat.f_add, + fdir_stat.f_remove); + unixctl_command_reply(fd, "%s############################%s\n", + fdir_stats_border, fdir_stats_border); + } +} + +UNIXCTL_CMD_REGISTER("netdev/fdir", "", "Show NIC FDIR.", 0, 0, + netdev_show_fdir_cmd_cb); + diff --git a/core/lb_device.h b/core/lb_device.h new file mode 100644 index 0000000..4b24908 --- /dev/null +++ b/core/lb_device.h @@ -0,0 +1,199 @@ +/* Copyright (c) 2018. TIG developer. */ + +#ifndef __LB_DEVICE_H__ +#define __LB_DEVICE_H__ + +#include +#include +#include +#include +#include +#include + +#include "lb_arp.h" +#include "lb_proto.h" +#include "lb_config.h" + +#define PKT_MAX_BURST 32 + +#define LB_MIN_L4_PORT (1024) +#define LB_MAX_L4_PORT (65535) + +enum { + LB_DEV_T_NORM = 0, /* Normal port. */ + LB_DEV_T_MASTER, /* Master bond port. */ + LB_DEV_T_SLAVE, /* Slave bond port. 
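Slave ports are driven through their master bond port and are skipped by the per-port unixctl commands.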
*/ +}; + +struct lb_laddr { + uint32_t ipv4; + uint16_t port_id; + uint16_t rxq_id; + struct rte_ring *ports[LB_IPPROTO_MAX]; +}; + +struct lb_laddr_list { + uint32_t nb; + struct lb_laddr entries[LB_MAX_LADDR]; +}; + +struct lb_device { + uint16_t type; + + /* Master bond port. */ + uint16_t master; + + struct ether_addr ha; + uint16_t mtu; + + uint32_t ipv4; + uint32_t netmask; + uint32_t gw; + + uint16_t nb_rxq, nb_txq; + uint16_t rxq_size, txq_size; + + uint32_t rx_offload; + uint32_t tx_offload; + + struct { + uint32_t rxq_enable; + uint16_t rxq_id; + uint16_t txq_id; + } lcore_conf[RTE_MAX_LCORE]; + + struct { + uint64_t rx_dropped; + uint64_t tx_dropped; + } lcore_stats[RTE_MAX_LCORE]; + + struct rte_eth_dev_tx_buffer *tx_buffer[RTE_MAX_LCORE]; + + char name[RTE_KNI_NAMESIZE]; + + struct rte_kni *kni; + + struct rte_mempool *mp; + + /* master-worker threads communication */ + struct rte_ring *ring; + + struct lb_laddr_list laddr_list[RTE_MAX_LCORE]; +}; + +extern struct lb_device lb_devices[RTE_MAX_ETHPORTS]; + +static inline int +lb_is_laddr_exist(uint32_t lip, uint16_t port_id) { + struct lb_laddr_list *list; + uint32_t lcore_id = rte_lcore_id(); + uint32_t i; + + list = &lb_devices[port_id].laddr_list[lcore_id]; + for (i = 0; i < list->nb; i++) { + if (lip == list->entries[i].ipv4) + return 1; + } + return 0; +} + +static inline int +lb_laddr_get(uint16_t port_id, enum lb_proto_type type, struct lb_laddr **laddr, + uint16_t *port) { + struct lb_device *dev; + struct lb_laddr_list *list; + struct lb_laddr *addr; + void *p = NULL; + uint32_t lcore_id, i; + + lcore_id = rte_lcore_id(); + dev = &lb_devices[port_id]; + list = &dev->laddr_list[lcore_id]; + + for (i = 0; i < list->nb; i++) { + addr = &list->entries[i]; + if (rte_ring_sc_dequeue(addr->ports[type], (void **)&p) == 0) { + *laddr = addr; + *port = (uint16_t)(uintptr_t)p; + return 0; + } + } + return -1; +} + +static inline void +lb_laddr_put(struct lb_laddr *laddr, uint16_t port, enum lb_proto_type type) { + rte_ring_sp_enqueue(laddr->ports[type], (void *)(uintptr_t)port); +} + +#define IS_SAME_NETWORK(addr1, addr2, netmask) \ + ((addr1 & netmask) == (addr2 & netmask)) + +static inline int +lb_device_dst_mac_find(uint32_t dip, struct ether_addr *ea, uint16_t port_id) { + struct lb_device *dev = &lb_devices[port_id]; + uint32_t rip; + int rc; + + if (IS_SAME_NETWORK(dip, dev->ipv4, dev->netmask)) { + rip = dip; + } else { + rip = dev->gw; + } + + rc = lb_arp_find(rip, ea, port_id); + if (rc < 0) { + lb_arp_request(rip, port_id); + } + + return rc; +} + +static inline int +lb_device_output(struct rte_mbuf *m, struct ipv4_hdr *iph, uint16_t port_id) { + struct lb_device *dev; + struct ether_hdr *eth; + uint32_t lcore_id; + uint16_t txq_id; + struct rte_eth_dev_tx_buffer *tx_buffer; + int rc; + + dev = &lb_devices[port_id]; + eth = rte_pktmbuf_mtod(m, struct ether_hdr *); + + rc = lb_device_dst_mac_find(iph->dst_addr, ð->d_addr, port_id); + if (rc < 0) { + rte_pktmbuf_free(m); + return rc; + } + ether_addr_copy(&dev->ha, ð->s_addr); + eth->ether_type = rte_cpu_to_be_16(ETHER_TYPE_IPv4); + + lcore_id = rte_lcore_id(); + txq_id = dev->lcore_conf[lcore_id].txq_id; + tx_buffer = dev->tx_buffer[lcore_id]; + rte_eth_tx_buffer(port_id, txq_id, tx_buffer, m); + + return 0; +} + +static inline struct rte_mbuf * +lb_device_pktmbuf_alloc(uint16_t port_id) { + struct lb_device *dev; + + dev = &lb_devices[port_id]; + return rte_pktmbuf_alloc(dev->mp); +} + +static inline struct rte_mbuf * +lb_device_pktmbuf_clone(struct rte_mbuf *m, 
uint16_t port_id) { + struct lb_device *dev; + + dev = &lb_devices[port_id]; + return rte_pktmbuf_clone(m, dev->mp); +} + +int lb_device_init(struct lb_device_conf *configs, uint16_t num); + +#endif /* __LB_DEVICE_H__ */ + diff --git a/core/lb_format.h b/core/lb_format.h new file mode 100644 index 0000000..2ed8f20 --- /dev/null +++ b/core/lb_format.h @@ -0,0 +1,22 @@ +/* Copyright (c) 2018. TIG developer. */ + +#ifndef __LB_FORMAT_H__ +#define __LB_FORMAT_H__ + +#define JSON_KV_S_FMT(K, D) "\"" K "\"" ":" "\"" "%s" "\"" D +#define JSON_KV_64_FMT(K, D) "\"" K "\"" ":" "%" PRIu64 D +#define JSON_KV_32_FMT(K, D) "\"" K "\"" ":" "%" PRIu32 D + +#define NORM_KV_S_FMT(K, D) K ": %s" D +#define NORM_KV_64_FMT(K, D) K ": %" PRIu64 D +#define NORM_KV_32_FMT(K, D) K ": %" PRIu32 D + +#define IPv4_BE_FMT "%u.%u.%u.%u" +#define IPv4_BE_ARG(ip) (ip & 0xff),((ip & 0xff00) >> 8),((ip & 0xff0000) >> 16),((ip & 0xff000000) >> 24) + +#define IPv4_BE(a, b, c, d) IPv4(d, c, b, a) + +#define IPv4_FMT "%u.%u.%u.%u" +#define IPv4_ARG(ip) ((ip & 0xff000000) >> 24),((ip & 0xff0000) >> 16),((ip & 0xff00) >> 8),(ip & 0xff) + +#endif diff --git a/core/lb_md5.h b/core/lb_md5.h new file mode 100644 index 0000000..df6f099 --- /dev/null +++ b/core/lb_md5.h @@ -0,0 +1,101 @@ +/* Copyright (c) 2018. TIG developer. */ + +#ifndef __LB_MD5_H__ +#define __LB_MD5_H__ + +#define MD5_DIGEST_WORDS 4 +#define MD5_MESSAGE_BYTES 64 + +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +#define MD5STEP(f, w, x, y, z, in, s) \ + (w += f(x, y, z) + in, w = (w << s | w >> (32 - s)) + x) + +static inline void +md5_transform(uint32_t *hash, uint32_t const *in) { + uint32_t a, b, c, d; + + a = hash[0]; + b = hash[1]; + c = hash[2]; + d = hash[3]; + + MD5STEP(F1, a, b, c, d, in[0] + 0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[1] + 0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[2] + 0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[3] + 0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[4] + 0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[5] + 0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[6] + 0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[7] + 0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[8] + 0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[9] + 0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10] + 0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11] + 0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12] + 0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13] + 0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14] + 0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15] + 0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[1] + 0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[6] + 0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11] + 0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[0] + 0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[5] + 0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10] + 0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15] + 0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[4] + 0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[9] + 0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14] + 0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[3] + 0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[8] + 0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13] + 0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[2] + 0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[7] + 0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12] + 0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[5] + 0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[8] + 0x8771f681, 
11); + MD5STEP(F3, c, d, a, b, in[11] + 0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14] + 0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[1] + 0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[4] + 0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[7] + 0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10] + 0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13] + 0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[0] + 0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[3] + 0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[6] + 0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[9] + 0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12] + 0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15] + 0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[2] + 0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[0] + 0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[7] + 0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14] + 0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[5] + 0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12] + 0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[3] + 0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10] + 0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[1] + 0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[8] + 0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15] + 0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[6] + 0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13] + 0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[4] + 0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11] + 0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[2] + 0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[9] + 0xeb86d391, 21); + + hash[0] += a; + hash[1] += b; + hash[2] += c; + hash[3] += d; +} + +#endif + diff --git a/core/lb_parser.c b/core/lb_parser.c new file mode 100644 index 0000000..f7f89f0 --- /dev/null +++ b/core/lb_parser.c @@ -0,0 +1,820 @@ +/* Copyright (c) 2018. TIG developer. */ + +/*- + * BSD LICENSE + * + * Copyright(c) 2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * For my_ether_aton() function: + * + * Copyright (c) 2009, Olivier MATZ + * All rights reserved. + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the University of California, Berkeley nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE REGENTS AND CONTRIBUTORS BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * For inet_pton4() and inet_pton6() functions: + * + * Copyright (c) 1996 by Internet Software Consortium. + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS + * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE + * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL + * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR + * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS + * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "lb_parser.h" + +static uint32_t +get_hex_val(char c) +{ + switch (c) { + case '0': case '1': case '2': case '3': case '4': case '5': + case '6': case '7': case '8': case '9': + return c - '0'; + case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': + return c - 'A' + 10; + case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': + return c - 'a' + 10; + default: + return 0; + } +} + +int +parser_read_arg_bool(const char *p) +{ + p = skip_white_spaces(p); + int result = -EINVAL; + + if (((p[0] == 'y') && (p[1] == 'e') && (p[2] == 's')) || + ((p[0] == 'Y') && (p[1] == 'E') && (p[2] == 'S'))) { + p += 3; + result = 1; + } + + if (((p[0] == 'o') && (p[1] == 'n')) || + ((p[0] == 'O') && (p[1] == 'N'))) { + p += 2; + result = 1; + } + + if (((p[0] == 'n') && (p[1] == 'o')) || + ((p[0] == 'N') && (p[1] == 'O'))) { + p += 2; + result = 0; + } + + if (((p[0] == 'o') && (p[1] == 'f') && (p[2] == 'f')) || + ((p[0] == 'O') && (p[1] == 'F') && (p[2] == 'F'))) { + p += 3; + result = 0; + } + + p = skip_white_spaces(p); + + if (p[0] != '\0') + return -EINVAL; + + return result; +} + +int +parser_read_uint64(uint64_t *value, const char *p) +{ + char *next; + uint64_t val; + + p = skip_white_spaces(p); + if (!isdigit(*p)) + return -EINVAL; + + val = strtoul(p, &next, 10); + if (p == next) + return -EINVAL; + + p = next; + switch (*p) { + case 'T': + val *= 1024ULL; + /* fall through */ + case 'G': + val *= 1024ULL; + /* fall through */ + case 'M': + val *= 1024ULL; + /* fall through */ + case 'k': + case 'K': + val *= 1024ULL; + p++; + break; + } + + p = skip_white_spaces(p); + if (*p != '\0') + return -EINVAL; + + *value = val; + return 0; +} + +int +parser_read_uint64_hex(uint64_t *value, const char *p) +{ + char *next; + uint64_t val; + + p = skip_white_spaces(p); + + val = strtoul(p, &next, 16); + if (p == next) + return -EINVAL; + + p = skip_white_spaces(next); + if (*p != '\0') + return -EINVAL; + + *value = val; + return 0; +} + +int +parser_read_int32(int32_t *value, const char *p) +{ + char *next; + int32_t val; + + p = skip_white_spaces(p); + if (!isdigit(*p)) + return -EINVAL; + + val = strtol(p, &next, 10); + if (p == next) + return -EINVAL; + + *value = val; + return 0; +} + +int +parser_read_uint32(uint32_t *value, const char *p) +{ + uint64_t val = 0; + int ret = parser_read_uint64(&val, p); + + if (ret < 0) + return ret; + + if (val > UINT32_MAX) + return -ERANGE; + + *value = val; + return 0; +} + +int +parser_read_uint32_hex(uint32_t *value, const char *p) +{ + uint64_t val = 0; + int ret = parser_read_uint64_hex(&val, p); + + if (ret < 0) + return ret; + + if (val > UINT32_MAX) + return -ERANGE; + + *value = val; + return 0; +} + +int +parser_read_uint16(uint16_t *value, const char *p) +{ + uint64_t val = 0; + int ret = parser_read_uint64(&val, p); + + if (ret < 0) + return ret; + + if (val > UINT16_MAX) + return -ERANGE; + + *value = val; + return 0; +} + +int +parser_read_uint16_hex(uint16_t *value, const char *p) +{ + uint64_t val = 0; + int ret = parser_read_uint64_hex(&val, p); + + if (ret < 0) + return ret; + + if (val > UINT16_MAX) + return -ERANGE; + + *value = val; + return 0; +} + +int +parser_read_uint8(uint8_t *value, const char *p) +{ + uint64_t val = 0; + int ret = parser_read_uint64(&val, p); + + if (ret < 0) + return ret; + + if (val > UINT8_MAX) + return -ERANGE; + + *value = val; + 
return 0; +} + +int +parser_read_uint8_hex(uint8_t *value, const char *p) +{ + uint64_t val = 0; + int ret = parser_read_uint64_hex(&val, p); + + if (ret < 0) + return ret; + + if (val > UINT8_MAX) + return -ERANGE; + + *value = val; + return 0; +} + +int +parse_tokenize_string(char *string, char *tokens[], uint32_t *n_tokens) +{ + uint32_t i; + + if ((string == NULL) || + (tokens == NULL) || + (*n_tokens < 1)) + return -EINVAL; + + for (i = 0; i < *n_tokens; i++) { + tokens[i] = strtok_r(string, PARSE_DELIMITER, &string); + if (tokens[i] == NULL) + break; + } + + if ((i == *n_tokens) && + (NULL != strtok_r(string, PARSE_DELIMITER, &string))) + return -E2BIG; + + *n_tokens = i; + return 0; +} + +int +parse_hex_string(char *src, uint8_t *dst, uint32_t *size) +{ + char *c; + uint32_t len, i; + + /* Check input parameters */ + if ((src == NULL) || + (dst == NULL) || + (size == NULL) || + (*size == 0)) + return -1; + + len = strlen(src); + if (((len & 3) != 0) || + (len > (*size) * 2)) + return -1; + *size = len / 2; + + for (c = src; *c != 0; c++) { + if ((((*c) >= '0') && ((*c) <= '9')) || + (((*c) >= 'A') && ((*c) <= 'F')) || + (((*c) >= 'a') && ((*c) <= 'f'))) + continue; + + return -1; + } + + /* Convert chars to bytes */ + for (i = 0; i < *size; i++) + dst[i] = get_hex_val(src[2 * i]) * 16 + + get_hex_val(src[2 * i + 1]); + + return 0; +} + +int +parse_mpls_labels(char *string, uint32_t *labels, uint32_t *n_labels) +{ + uint32_t n_max_labels = *n_labels, count = 0; + + /* Check for void list of labels */ + if (strcmp(string, "") == 0) { + *n_labels = 0; + return 0; + } + + /* At least one label should be present */ + for ( ; (*string != '\0'); ) { + char *next; + int value; + + if (count >= n_max_labels) + return -1; + + if (count > 0) { + if (string[0] != ':') + return -1; + + string++; + } + + value = strtol(string, &next, 10); + if (next == string) + return -1; + string = next; + + labels[count++] = (uint32_t) value; + } + + *n_labels = count; + return 0; +} + +#define INADDRSZ 4 +#define IN6ADDRSZ 16 + +/* int + * inet_pton4(src, dst) + * like inet_aton() but without all the hexadecimal and shorthand. + * return: + * 1 if `src' is a valid dotted quad, else 0. + * notice: + * does not touch `dst' unless it's returning 1. + * author: + * Paul Vixie, 1996. + */ +static int +inet_pton4(const char *src, unsigned char *dst) +{ + static const char digits[] = "0123456789"; + int saw_digit, octets, ch; + unsigned char tmp[INADDRSZ], *tp; + + saw_digit = 0; + octets = 0; + *(tp = tmp) = 0; + while ((ch = *src++) != '\0') { + const char *pch; + + pch = strchr(digits, ch); + if (pch != NULL) { + unsigned int new = *tp * 10 + (pch - digits); + + if (new > 255) + return 0; + if (!saw_digit) { + if (++octets > 4) + return 0; + saw_digit = 1; + } + *tp = (unsigned char)new; + } else if (ch == '.' && saw_digit) { + if (octets == 4) + return 0; + *++tp = 0; + saw_digit = 0; + } else + return 0; + } + if (octets < 4) + return 0; + + memcpy(dst, tmp, INADDRSZ); + return 1; +} + +/* int + * inet_pton6(src, dst) + * convert presentation level address to network order binary form. + * return: + * 1 if `src' is a valid [RFC1884 2.2] address, else 0. + * notice: + * (1) does not touch `dst' unless it's returning 1. + * (2) :: in a full address is silently ignored. + * credit: + * inspired by Mark Andrews. + * author: + * Paul Vixie, 1996. 
+ */ +static int +inet_pton6(const char *src, unsigned char *dst) +{ + static const char xdigits_l[] = "0123456789abcdef", + xdigits_u[] = "0123456789ABCDEF"; + unsigned char tmp[IN6ADDRSZ], *tp = 0, *endp = 0, *colonp = 0; + const char *xdigits = 0, *curtok = 0; + int ch = 0, saw_xdigit = 0, count_xdigit = 0; + unsigned int val = 0; + unsigned dbloct_count = 0; + + memset((tp = tmp), '\0', IN6ADDRSZ); + endp = tp + IN6ADDRSZ; + colonp = NULL; + /* Leading :: requires some special handling. */ + if (*src == ':') + if (*++src != ':') + return 0; + curtok = src; + saw_xdigit = count_xdigit = 0; + val = 0; + + while ((ch = *src++) != '\0') { + const char *pch; + + pch = strchr((xdigits = xdigits_l), ch); + if (pch == NULL) + pch = strchr((xdigits = xdigits_u), ch); + if (pch != NULL) { + if (count_xdigit >= 4) + return 0; + val <<= 4; + val |= (pch - xdigits); + if (val > 0xffff) + return 0; + saw_xdigit = 1; + count_xdigit++; + continue; + } + if (ch == ':') { + curtok = src; + if (!saw_xdigit) { + if (colonp) + return 0; + colonp = tp; + continue; + } else if (*src == '\0') { + return 0; + } + if (tp + sizeof(int16_t) > endp) + return 0; + *tp++ = (unsigned char) ((val >> 8) & 0xff); + *tp++ = (unsigned char) (val & 0xff); + saw_xdigit = 0; + count_xdigit = 0; + val = 0; + dbloct_count++; + continue; + } + if (ch == '.' && ((tp + INADDRSZ) <= endp) && + inet_pton4(curtok, tp) > 0) { + tp += INADDRSZ; + saw_xdigit = 0; + dbloct_count += 2; + break; /* '\0' was seen by inet_pton4(). */ + } + return 0; + } + if (saw_xdigit) { + if (tp + sizeof(int16_t) > endp) + return 0; + *tp++ = (unsigned char) ((val >> 8) & 0xff); + *tp++ = (unsigned char) (val & 0xff); + dbloct_count++; + } + if (colonp != NULL) { + /* if we already have 8 double octets, having a colon means error */ + if (dbloct_count == 8) + return 0; + + /* + * Since some memmove()'s erroneously fail to handle + * overlapping regions, we'll do the shift by hand. 
+ */ + const int n = tp - colonp; + int i; + + for (i = 1; i <= n; i++) { + endp[-i] = colonp[n - i]; + colonp[n - i] = 0; + } + tp = endp; + } + if (tp != endp) + return 0; + memcpy(dst, tmp, IN6ADDRSZ); + return 1; +} + +static struct ether_addr * +my_ether_aton(const char *a) +{ + int i; + char *end; + unsigned long o[ETHER_ADDR_LEN]; + static struct ether_addr ether_addr; + + i = 0; + do { + errno = 0; + o[i] = strtoul(a, &end, 16); + if (errno != 0 || end == a || (end[0] != ':' && end[0] != 0)) + return NULL; + a = end + 1; + } while (++i != sizeof(o) / sizeof(o[0]) && end[0] != 0); + + /* Junk at the end of line */ + if (end[0] != 0) + return NULL; + + /* Support the format XX:XX:XX:XX:XX:XX */ + if (i == ETHER_ADDR_LEN) { + while (i-- != 0) { + if (o[i] > UINT8_MAX) + return NULL; + ether_addr.addr_bytes[i] = (uint8_t)o[i]; + } + /* Support the format XXXX:XXXX:XXXX */ + } else if (i == ETHER_ADDR_LEN / 2) { + while (i-- != 0) { + if (o[i] > UINT16_MAX) + return NULL; + ether_addr.addr_bytes[i * 2] = (uint8_t)(o[i] >> 8); + ether_addr.addr_bytes[i * 2 + 1] = (uint8_t)(o[i] & 0xff); + } + /* unknown format */ + } else + return NULL; + + return (struct ether_addr *)ðer_addr; +} + +int +parse_ipv4_addr(const char *token, struct in_addr *ipv4) +{ + if (strlen(token) >= INET_ADDRSTRLEN) + return -EINVAL; + + if (inet_pton4(token, (unsigned char *)ipv4) != 1) + return -EINVAL; + + return 0; +} + +int +parse_ipv6_addr(const char *token, struct in6_addr *ipv6) +{ + if (strlen(token) >= INET6_ADDRSTRLEN) + return -EINVAL; + + if (inet_pton6(token, (unsigned char *)ipv6) != 1) + return -EINVAL; + + return 0; +} + +int +parse_mac_addr(const char *token, struct ether_addr *addr) +{ + struct ether_addr *tmp; + + tmp = my_ether_aton(token); + if (tmp == NULL) + return -1; + + memcpy(addr, tmp, sizeof(struct ether_addr)); + return 0; +} + +int +parse_l4_proto(const char *token, uint8_t *proto) +{ + if (strcasecmp(token, "tcp") == 0) { + *proto = IPPROTO_TCP; + return 0; + } + + if (strcasecmp(token, "udp") == 0) { + *proto = IPPROTO_UDP; + return 0; + } + + return -1; +} + +int +parse_pipeline_core(uint32_t *socket, + uint32_t *core, + uint32_t *ht, + const char *entry) +{ + size_t num_len; + char num[8]; + + uint32_t s = 0, c = 0, h = 0, val; + uint8_t s_parsed = 0, c_parsed = 0, h_parsed = 0; + const char *next = skip_white_spaces(entry); + char type; + + /* Expect or [sX][cY][h]. At least one parameter is required. */ + while (*next != '\0') { + /* If everything parsed nothing should left */ + if (s_parsed && c_parsed && h_parsed) + return -EINVAL; + + type = *next; + switch (type) { + case 's': + case 'S': + if (s_parsed || c_parsed || h_parsed) + return -EINVAL; + s_parsed = 1; + next++; + break; + case 'c': + case 'C': + if (c_parsed || h_parsed) + return -EINVAL; + c_parsed = 1; + next++; + break; + case 'h': + case 'H': + if (h_parsed) + return -EINVAL; + h_parsed = 1; + next++; + break; + default: + /* If it start from digit it must be only core id. 
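(e.g. a bare "2" is parsed as core id 2, as if it were "c2".)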
*/ + if (!isdigit(*next) || s_parsed || c_parsed || h_parsed) + return -EINVAL; + + type = 'C'; + } + + for (num_len = 0; *next != '\0'; next++, num_len++) { + if (num_len == RTE_DIM(num)) + return -EINVAL; + + if (!isdigit(*next)) + break; + + num[num_len] = *next; + } + + if (num_len == 0 && type != 'h' && type != 'H') + return -EINVAL; + + if (num_len != 0 && (type == 'h' || type == 'H')) + return -EINVAL; + + num[num_len] = '\0'; + val = strtol(num, NULL, 10); + + h = 0; + switch (type) { + case 's': + case 'S': + s = val; + break; + case 'c': + case 'C': + c = val; + break; + case 'h': + case 'H': + h = 1; + break; + } + } + + *socket = s; + *core = c; + *ht = h; + return 0; +} + +int +str_split(char *str, const char *delim, char *tokens[], int limit) +{ + char *p; + int count = 0; + + p = strtok(str, delim); + while (p != NULL && count < limit) { + tokens[count++] = p; + p = strtok(NULL, delim); + } + return count; +} + +int +parse_ipv4_port(const char *token, uint32_t *ip, uint16_t *port) +{ + char *t, *p; + + t = strdup(token); + if (t == NULL) + return -1; + p = strtok(t, ":"); + if (!p || parse_ipv4_addr(p, (struct in_addr *)ip) < 0) { + free(t); + return -1; + } + p = strtok(NULL, ":"); + if (!p || parser_read_uint16(port, p) < 0) { + free(t); + return -1; + } + *port = htons(*port); + free(t); + return 0; +} + diff --git a/core/lb_parser.h b/core/lb_parser.h new file mode 100644 index 0000000..f762803 --- /dev/null +++ b/core/lb_parser.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2018. TIG developer. */ + +/*- + * BSD LICENSE + * + * Copyright(c) 2010-2016 Intel Corporation. All rights reserved. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __INCLUDE_PARSER_H__ +#define __INCLUDE_PARSER_H__ + +#include + +#include +#include + +#define PARSE_DELIMITER " \f\n\r\t\v" + +#define skip_white_spaces(pos) \ +({ \ + __typeof__(pos) _p = (pos); \ + for ( ; isspace(*_p); _p++) \ + ; \ + _p; \ +}) + +static inline size_t +skip_digits(const char *src) +{ + size_t i; + + for (i = 0; isdigit(src[i]); i++) + ; + + return i; +} + +int parser_read_arg_bool(const char *p); + +int parser_read_int32(int32_t *value, const char *p); +int parser_read_uint64(uint64_t *value, const char *p); +int parser_read_uint32(uint32_t *value, const char *p); +int parser_read_uint16(uint16_t *value, const char *p); +int parser_read_uint8(uint8_t *value, const char *p); + +int parser_read_uint64_hex(uint64_t *value, const char *p); +int parser_read_uint32_hex(uint32_t *value, const char *p); +int parser_read_uint16_hex(uint16_t *value, const char *p); +int parser_read_uint8_hex(uint8_t *value, const char *p); + +int parse_hex_string(char *src, uint8_t *dst, uint32_t *size); + +int parse_ipv4_addr(const char *token, struct in_addr *ipv4); +int parse_ipv6_addr(const char *token, struct in6_addr *ipv6); +int parse_mac_addr(const char *token, struct ether_addr *addr); +int parse_mpls_labels(char *string, uint32_t *labels, uint32_t *n_labels); +int parse_l4_proto(const char *token, uint8_t *proto); + +int parse_tokenize_string(char *string, char *tokens[], uint32_t *n_tokens); + +int parse_pipeline_core(uint32_t *socket, uint32_t *core, uint32_t *ht, const char *entry); + +int str_split(char *str, const char *delim, char *tokens[], int limit); +int parse_ipv4_port(const char *token, uint32_t *ip, uint16_t *port); + +static inline void +mac_addr_tostring(struct ether_addr *addr, char *buf, size_t len) +{ + snprintf(buf, len, "%02x:%02x:%02x:%02x:%02x:%02x", + addr->addr_bytes[0], + addr->addr_bytes[1], + addr->addr_bytes[2], + addr->addr_bytes[3], + addr->addr_bytes[4], + addr->addr_bytes[5]); +} + +static inline void +ipv4_addr_tostring(uint32_t ipv4, char *buf, size_t len) +{ + ipv4 = rte_be_to_cpu_32(ipv4); + snprintf(buf, len, "%u.%u.%u.%u", + (unsigned char)(ipv4 >> 24 & 0xff), + (unsigned char)(ipv4 >> 16 & 0xff), + (unsigned char)(ipv4 >> 8 & 0xff), + (unsigned char)(ipv4 & 0xff)); +} + +#endif diff --git a/core/lb_proto.c b/core/lb_proto.c new file mode 100644 index 0000000..92adece --- /dev/null +++ b/core/lb_proto.c @@ -0,0 +1,29 @@ +/* Copyright (c) 2018. TIG developer. */ + +#include + +#include + +#include "lb_proto.h" + +struct lb_proto *lb_protos[LB_IPPROTO_MAX]; + +enum lb_proto_type lb_proto_types[IPPROTO_MAX]; + +int +lb_proto_init(void) { + uint16_t i; + + RTE_LOG(INFO, USER1, "%s(): lb_protos[%u] size = %luKB\n", __func__, + LB_IPPROTO_MAX, (sizeof(lb_protos) + 1023) / 1024); + + for (i = 0; i < LB_IPPROTO_MAX; i++) { + if (lb_protos[i] != NULL && lb_protos[i]->init() < 0) { + RTE_LOG(ERR, USER1, "%s(): proto[%u] init failed.\n", __func__, i); + return -1; + } + } + + return 0; +} + diff --git a/core/lb_proto.h b/core/lb_proto.h new file mode 100644 index 0000000..2a7c081 --- /dev/null +++ b/core/lb_proto.h @@ -0,0 +1,111 @@ +/* Copyright (c) 2018. TIG developer. 
*/ + +#ifndef __LB_PROTO_H__ +#define __LB_PROTO_H__ + +#include + +#include +#include +#include +#include + +enum lb_proto_type { + LB_IPPROTO_TCP, + LB_IPPROTO_UDP, + LB_IPPROTO_ICMP, + LB_IPPROTO_MAX, +}; + +enum { + LB_DIR_ORIGINAL, + LB_DIR_REPLY, + LB_DIR_MAX, +}; + +struct lb_proto { + uint8_t id; + enum lb_proto_type type; + int (*init)(void); + int (*fullnat_handle)(struct rte_mbuf *, struct ipv4_hdr *, uint16_t); +}; + +#define IPv4_HLEN(iph) (((iph)->version_ihl & IPV4_HDR_IHL_MASK) << 2) +#define TCP_HDR(iph) (struct tcp_hdr *)((char *)(iph) + IPv4_HLEN(iph)) +#define UDP_HDR(iph) (struct udp_hdr *)((char *)(iph) + IPv4_HLEN(iph)) + +#define SYN(th) ((th)->tcp_flags & TCP_SYN_FLAG) +#define ACK(th) ((th)->tcp_flags & TCP_ACK_FLAG) +#define RST(th) ((th)->tcp_flags & TCP_RST_FLAG) +#define FIN(th) ((th)->tcp_flags & TCP_FIN_FLAG) + +/* + * TCP option + */ + +#define TCPOPT_NOP 1 /* Padding */ +#define TCPOPT_EOL 0 /* End of options */ +#define TCPOPT_MSS 2 /* Segment size negotiating */ +#define TCPOPT_WINDOW 3 /* Window scaling */ +#define TCPOPT_SACK_PERM 4 /* SACK Permitted */ +#define TCPOPT_SACK 5 /* SACK Block */ +#define TCPOPT_TIMESTAMP 8 /* Better RTT estimations/PAWS */ +#define TCPOPT_MD5SIG 19 /* MD5 Signature (RFC2385) */ +#define TCPOPT_FASTOPEN 34 /* Fast open (RFC7413) */ +#define TCPOPT_EXP 254 /* Experimental */ + +/* + * TCP option lengths + */ + +#define TCPOLEN_MSS 4 +#define TCPOLEN_WINDOW 3 +#define TCPOLEN_SACK_PERM 2 +#define TCPOLEN_TIMESTAMP 10 +#define TCPOLEN_MD5SIG 18 +#define TCPOLEN_FASTOPEN_BASE 2 +#define TCPOLEN_EXP_FASTOPEN_BASE 4 + +#define TCPOLEN_TSTAMP_ALIGNED 12 +#define TCPOLEN_WSCALE_ALIGNED 4 +#define TCPOLEN_SACKPERM_ALIGNED 4 +#define TCPOLEN_SACK_BASE 2 +#define TCPOLEN_SACK_BASE_ALIGNED 4 +#define TCPOLEN_SACK_PERBLOCK 8 +#define TCPOLEN_MD5SIG_ALIGNED 20 +#define TCPOLEN_MSS_ALIGNED 4 + +enum tcp_conntrack { + TCP_CONNTRACK_NONE, + TCP_CONNTRACK_SYN_SENT, + TCP_CONNTRACK_SYN_RECV, + TCP_CONNTRACK_ESTABLISHED, + TCP_CONNTRACK_FIN_WAIT, + TCP_CONNTRACK_CLOSE_WAIT, + TCP_CONNTRACK_LAST_ACK, + TCP_CONNTRACK_TIME_WAIT, + TCP_CONNTRACK_CLOSE, + TCP_CONNTRACK_LISTEN, /* obsolete */ +#define TCP_CONNTRACK_SYN_SENT2 TCP_CONNTRACK_LISTEN + TCP_CONNTRACK_MAX, + TCP_CONNTRACK_IGNORE +}; + +extern struct lb_proto *lb_protos[LB_IPPROTO_MAX]; +extern enum lb_proto_type lb_proto_types[IPPROTO_MAX]; + +#define LB_PROTO_REGISTER(p) \ + __attribute__((constructor)) static void proto_register_##p(void) { \ + lb_protos[p.type] = &p; \ + lb_proto_types[p.id] = p.type; \ + } + +static inline struct lb_proto * +lb_proto_get(uint8_t id) { + return lb_protos[lb_proto_types[id]]; +} + +int lb_proto_init(void); + +#endif + diff --git a/core/lb_proto_icmp.c b/core/lb_proto_icmp.c new file mode 100644 index 0000000..3b7a54c --- /dev/null +++ b/core/lb_proto_icmp.c @@ -0,0 +1,72 @@ +/* Copyright (c) 2018. TIG developer. */ + +#include +#include +#include +#include + +#include "lb_device.h" +#include "lb_proto.h" +#include "lb_service.h" + +static uint32_t +icmp_cksum(const struct ipv4_hdr *iph, const struct icmp_hdr *icmph) { + uint16_t cksum; + size_t len; + + len = rte_be_to_cpu_16(iph->total_length) - sizeof(struct ipv4_hdr); + cksum = rte_raw_cksum(icmph, len); + return (cksum == 0xffff) ? 
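/* keep an all-ones raw sum as-is, otherwise finalize with the one's complement */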
cksum : ~cksum; +} + +static int +icmp_fullnat_handle(struct rte_mbuf *m, struct ipv4_hdr *iph, + uint16_t port_id) { + struct icmp_hdr *icmph; + uint32_t tmpaddr; + + if (rte_ipv4_frag_pkt_is_fragmented(iph)) { + rte_pktmbuf_free(m); + return 0; + } + + if (!lb_is_vip_exist(iph->dst_addr) && + !lb_is_laddr_exist(iph->dst_addr, port_id)) { + rte_pktmbuf_free(m); + return 0; + } + + icmph = (struct icmp_hdr *)((char *)iph + IPv4_HLEN(iph)); + if (!((icmph->icmp_type == IP_ICMP_ECHO_REQUEST) && + (icmph->icmp_code == 0))) { + rte_pktmbuf_free(m); + return 0; + } + + tmpaddr = iph->src_addr; + iph->src_addr = iph->dst_addr; + iph->dst_addr = tmpaddr; + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + + icmph->icmp_type = IP_ICMP_ECHO_REPLY; + icmph->icmp_cksum = 0; + icmph->icmp_cksum = icmp_cksum(iph, icmph); + + return lb_device_output(m, iph, port_id); +} + +static int +icmp_init(void) { + return 0; +} + +static struct lb_proto proto_icmp = { + .id = IPPROTO_ICMP, + .type = LB_IPPROTO_ICMP, + .init = icmp_init, + .fullnat_handle = icmp_fullnat_handle, +}; + +LB_PROTO_REGISTER(proto_icmp); + diff --git a/core/lb_proto_tcp.c b/core/lb_proto_tcp.c new file mode 100644 index 0000000..e942dcc --- /dev/null +++ b/core/lb_proto_tcp.c @@ -0,0 +1,718 @@ +/* Copyright (c) 2018. TIG developer. */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "lb_clock.h" +#include "lb_conn.h" +#include "lb_device.h" +#include "lb_format.h" +#include "lb_proto.h" +#include "lb_synproxy.h" +#include "lb_tcp_secret_seq.h" +#include "lb_toa.h" + +//#define TCP_DEBUG +#ifdef TCP_DEBUG +#define TCP_PRINT(...) \ + do { \ + fprintf(stderr, "[core%u]TCP: ", rte_lcore_id()); \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) +#else +#define TCP_PRINT(...) \ + do { \ + } while (0) +#endif + +#define IPv4_TCP_FMT IPv4_BE_FMT ":%u -> " IPv4_BE_FMT ":%u [%c%c%c%c]" +#define IPv4_TCP_ARG(iph, th) \ + IPv4_BE_ARG((iph)->src_addr), rte_be_to_cpu_16((th)->src_port), \ + IPv4_BE_ARG((iph)->dst_addr), rte_be_to_cpu_16((th)->dst_port), \ + SYN(th) ? 'S' : '-', ACK(th) ? 'A' : '-', RST(th) ? 'R' : '-', \ + FIN(th) ? 'F' : '-' + +#define sNO TCP_CONNTRACK_NONE +#define sSS TCP_CONNTRACK_SYN_SENT +#define sSR TCP_CONNTRACK_SYN_RECV +#define sES TCP_CONNTRACK_ESTABLISHED +#define sFW TCP_CONNTRACK_FIN_WAIT +#define sCW TCP_CONNTRACK_CLOSE_WAIT +#define sLA TCP_CONNTRACK_LAST_ACK +#define sTW TCP_CONNTRACK_TIME_WAIT +#define sCL TCP_CONNTRACK_CLOSE +#define sS2 TCP_CONNTRACK_SYN_SENT2 +#define sIV TCP_CONNTRACK_MAX +#define sIG TCP_CONNTRACK_IGNORE + +enum tcp_bit_set { + TCP_SYN_SET, + TCP_SYNACK_SET, + TCP_FIN_SET, + TCP_ACK_SET, + TCP_RST_SET, + TCP_NONE_SET, +}; + +/* + * The TCP state transition table needs a few words... + * + * We are the man in the middle. All the packets go through us + * but might get lost in transit to the destination. + * It is assumed that the destinations can't receive segments + * we haven't seen. + * + * The checked segment is in window, but our windows are *not* + * equivalent with the ones of the sender/receiver. We always + * try to guess the state of the current sender. 
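+ * + * (These transition tables appear to be derived from the Linux netfilter TCP conntrack implementation. For example, a SYN seen in the ORIGINAL direction while the tracker is in sNO moves the connection to sSS, i.e. tcp_conntracks[LB_DIR_ORIGINAL][TCP_SYN_SET][sNO] == sSS.)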
+ * + * The meaning of the states are: + * + * NONE: initial state + * SYN_SENT: SYN-only packet seen + * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open + * SYN_RECV: SYN-ACK packet seen + * ESTABLISHED: ACK packet seen + * FIN_WAIT: FIN packet seen + * CLOSE_WAIT: ACK seen (after FIN) + * LAST_ACK: FIN seen (after FIN) + * TIME_WAIT: last ACK seen + * CLOSE: closed connection (RST) + * + * Packets marked as IGNORED (sIG): + * if they may be either invalid or valid + * and the receiver may send back a connection + * closing RST or a SYN/ACK. + * + * Packets marked as INVALID (sIV): + * if we regard them as truly invalid packets + */ +static const uint8_t tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { + {/* ORIGINAL */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*syn*/ {sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2}, + /* + * sNO -> sSS Initialize a new connection + * sSS -> sSS Retransmitted SYN + * sS2 -> sS2 Late retransmitted SYN + * sSR -> sIG + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sSS Reopened connection (RFC 1122). + * sCL -> sSS + */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*synack*/ {sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR}, + /* + * sNO -> sIV Too late and no reason to do anything + * sSS -> sIV Client can't send SYN and then SYN/ACK + * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open + * sSR -> sIG + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*fin*/ {sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV}, + /* + * sNO -> sIV Too late and no reason to do anything... + * sSS -> sIV Client migth not send FIN in this state: + * we enforce waiting for a SYN/ACK reply first. + * sS2 -> sIV + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions, waiting for + * the last ACK. + * Migth be a retransmitted FIN as well... + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. Remain in the same state. + * sTW -> sTW + * sCL -> sCL + */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*ack*/ {sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV}, + /* + * sNO -> sES Assumed. + * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sS2 -> sIV + * sSR -> sES Established state is reached. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. Remain in the same state. + * sCL -> sCL + */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*rst*/ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + /*none*/ {sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV}}, + {/* REPLY */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*syn*/ {sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2}, + /* + * sNO -> sIV Never reached. + * sSS -> sS2 Simultaneous open + * sS2 -> sS2 Retransmitted simultaneous SYN + * sSR -> sIV Invalid SYN packets sent by the server + * sES -> sIV + * sFW -> sIV + * sCW -> sIV + * sLA -> sIV + * sTW -> sIV Reopened connection, but server may not do it. 
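(Unlike the ORIGINAL direction, where sTW -> sSS allows reopening.)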
+ * sCL -> sIV + */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*synack*/ {sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sSR}, + /* + * sSS -> sSR Standard open. + * sS2 -> sSR Simultaneous open + * sSR -> sSR Retransmitted SYN/ACK. + * sES -> sIG Late retransmitted SYN/ACK? + * sFW -> sIG Might be SYN/ACK answering ignored SYN + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG + */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*fin*/ {sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV}, + /* + * sSS -> sIV Server might not send FIN in this state. + * sS2 -> sIV + * sSR -> sFW Close started. + * sES -> sFW + * sFW -> sLA FIN seen in both directions. + * sCW -> sLA + * sLA -> sLA Retransmitted FIN. + * sTW -> sTW + * sCL -> sCL + */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*ack*/ {sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG}, + /* + * sSS -> sIG Might be a half-open connection. + * sS2 -> sIG + * sSR -> sSR Might answer late resent SYN. + * sES -> sES :-) + * sFW -> sCW Normal close request answered by ACK. + * sCW -> sCW + * sLA -> sTW Last ACK detected. + * sTW -> sTW Retransmitted last ACK. + * sCL -> sCL + */ + /* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ + /*rst*/ {sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL}, + /*none*/ {sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV}}}; + +uint32_t tcp_timeouts[TCP_CONNTRACK_MAX] = { + [TCP_CONNTRACK_NONE] = 2 * LB_CLOCK_HZ, + [TCP_CONNTRACK_SYN_SENT] = 30 * LB_CLOCK_HZ, + [TCP_CONNTRACK_SYN_RECV] = 30 * LB_CLOCK_HZ, + [TCP_CONNTRACK_ESTABLISHED] = 90 * LB_CLOCK_HZ, + [TCP_CONNTRACK_FIN_WAIT] = 30 * LB_CLOCK_HZ, + [TCP_CONNTRACK_CLOSE_WAIT] = 30 * LB_CLOCK_HZ, + [TCP_CONNTRACK_LAST_ACK] = 30 * LB_CLOCK_HZ, + [TCP_CONNTRACK_TIME_WAIT] = 0 * LB_CLOCK_HZ, + [TCP_CONNTRACK_CLOSE] = 0 * LB_CLOCK_HZ, + [TCP_CONNTRACK_LISTEN] = 0 * LB_CLOCK_HZ, +}; + +static const char *const tcp_conntrack_names[] = { + "NONE", "SYN_SENT", "SYN_RECV", "ESTABLISHED", "FIN_WAIT", + "CLOSE_WAIT", "LAST_ACK", "TIME_WAIT", "CLOSE", "SYN_SENT2", +}; + +static struct lb_conn_table lb_conn_tbls[RTE_MAX_LCORE]; + +static inline uint32_t +get_conntrack_index(const struct tcp_hdr *th) { + if (RST(th)) + return TCP_RST_SET; + else if (SYN(th)) + return (ACK(th) ? 
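/* SYN together with ACK is a SYN/ACK, otherwise a plain SYN */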
TCP_SYNACK_SET : TCP_SYN_SET); + else if (FIN(th)) + return TCP_FIN_SET; + else if (ACK(th)) + return TCP_ACK_SET; + else + return TCP_NONE_SET; +} + +static void +tcp_set_conntack_state(struct lb_conn *conn, struct tcp_hdr *th, int dir) { + uint32_t index; + uint32_t old_state; + uint32_t new_state; + uint32_t lcore_id = rte_lcore_id(); + struct lb_real_service *rs = conn->real_service; + struct lb_virt_service *vs = rs->virt_service; + uint32_t timeout; + + index = get_conntrack_index(th); + old_state = conn->state; + new_state = tcp_conntracks[dir][index][old_state]; + if (!(conn->flags & LB_CONN_F_ACTIVE) && + (new_state == TCP_CONNTRACK_ESTABLISHED)) { + conn->flags |= LB_CONN_F_ACTIVE; + rte_atomic32_add(&rs->active_conns, 1); + rte_atomic32_add(&vs->active_conns, 1); + vs->stats[lcore_id].conns += 1; + rs->stats[lcore_id].conns += 1; + } else if ((conn->flags & LB_CONN_F_ACTIVE) && + (new_state != TCP_CONNTRACK_ESTABLISHED)) { + conn->flags &= ~LB_CONN_F_ACTIVE; + rte_atomic32_add(&rs->active_conns, -1); + rte_atomic32_add(&vs->active_conns, -1); + } + if (new_state < TCP_CONNTRACK_MAX) { + conn->state = new_state; + conn->timeout = tcp_timeouts[new_state]; + } + if (new_state == TCP_CONNTRACK_ESTABLISHED) { + timeout = vs->est_timeout; + conn->timeout = + timeout != 0 ? timeout : tcp_timeouts[TCP_CONNTRACK_ESTABLISHED]; + } + + if (conn->state == TCP_CONNTRACK_CLOSE || + conn->state == TCP_CONNTRACK_TIME_WAIT) { + TCP_PRINT("conn:{cip=%u.%u.%u.%u cport=%u, " + "vip=%u.%u.%u.%u, vport=%u, lip=%u.%u.%u.%u, " + "lport=%u,rip=%u.%u.%u.%u, rport=%u, %c%c%c%c, state=%s}\n", + IPv4_BE_ARG(conn->cip), rte_be_to_cpu_16(conn->cport), + IPv4_BE_ARG(conn->vip), rte_be_to_cpu_16(conn->vport), + IPv4_BE_ARG(conn->lip), rte_be_to_cpu_16(conn->lport), + IPv4_BE_ARG(conn->rip), rte_be_to_cpu_16(conn->rport), + SYN(th) ? 'S' : '-', ACK(th) ? 'A' : '-', RST(th) ? 'R' : '-', + FIN(th) ? 'F' : '-', + conn->state == TCP_CONNTRACK_CLOSE ? "close" : "timewait"); + } +} + +static void +tcp_set_packet_stats(struct lb_conn *conn, struct rte_mbuf *m, uint8_t dir) { + struct lb_real_service *rs; + struct lb_virt_service *vs; + uint32_t cid; + + cid = rte_lcore_id(); + rs = conn->real_service; + vs = rs->virt_service; + vs->stats[cid].bytes[dir] += m->pkt_len; + vs->stats[cid].packets[dir] += 1; + rs->stats[cid].bytes[dir] += m->pkt_len; + rs->stats[cid].packets[dir] += 1; +} + +static void +tcp_conn_timer_task_cb(struct lb_conn *conn) { + struct rte_mbuf *mcopy; + struct ipv4_hdr *iph; + + if ((conn->flags & LB_CONN_F_SYNPROXY) && + (conn->state == TCP_CONNTRACK_SYN_SENT) && + (conn->proxy.syn_mbuf != NULL)) { + if (conn->proxy.syn_retry == 0) { + rte_pktmbuf_free(conn->proxy.syn_mbuf); + conn->proxy.syn_mbuf = NULL; + } else { + conn->proxy.syn_retry--; + mcopy = rte_pktmbuf_clone(conn->proxy.syn_mbuf, + conn->proxy.syn_mbuf->pool); + if (mcopy != NULL) { + iph = rte_pktmbuf_mtod_offset(mcopy, struct ipv4_hdr *, + ETHER_HDR_LEN); + lb_device_output(mcopy, iph, mcopy->port); + } + } + } +} + +static int +tcp_conn_timer_expire_cb(struct lb_conn *conn, uint32_t ctime) { + /* sent rst to client and real srvice. 
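(Intended to also send an RST to the client and the real service when a connection expires; the current implementation only checks whether the connection has timed out.)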
*/
+
+ if (ctime - conn->use_time > conn->timeout)
+ return 0;
+ else
+ return -1;
+}
+
+static struct lb_conn *
+tcp_conn_schedule(struct lb_conn_table *ct, struct ipv4_hdr *iph,
+ struct tcp_hdr *th, uint16_t port_id) {
+ struct lb_virt_service *vs;
+ struct lb_real_service *rs;
+ struct lb_conn *conn;
+
+ /* Only a bare SYN may create a new connection. */
+ if (!SYN(th) || ACK(th) || RST(th) || FIN(th))
+ return NULL;
+
+ vs = lb_vs_get(iph->dst_addr, th->dst_port, iph->next_proto_id);
+ if (vs == NULL)
+ return NULL;
+
+ /* lb_vs_check_max_conn() returns nonzero once active_conns has
+ * reached max_conns, so a nonzero result rejects the connection. */
+ if (lb_vs_check_max_conn(vs)) {
+ lb_vs_put(vs);
+ return NULL;
+ }
+
+ rs = lb_vs_get_rs(vs, iph->src_addr, th->src_port);
+ if (rs == NULL) {
+ lb_vs_put(vs);
+ return NULL;
+ }
+
+ conn = lb_conn_new(ct, iph->src_addr, th->src_port, rs, 0, port_id);
+ if (conn == NULL) {
+ lb_vs_put(vs);
+ lb_vs_put_rs(rs);
+ return NULL;
+ }
+
+ /* The conn keeps its own vs reference through rs, so drop the one
+ * taken by lb_vs_get(), as the UDP path does. */
+ lb_vs_put(vs);
+ return conn;
+}
+
+static void
+tcp_opt_remove_timestamp(struct tcp_hdr *th) {
+ uint8_t *ptr;
+ int len;
+ uint32_t *tmp;
+
+ ptr = (uint8_t *)(th + 1);
+ len = (th->data_off >> 2) - sizeof(struct tcp_hdr);
+ while (len > 0) {
+ int opcode = *ptr++;
+ int opsize;
+
+ switch (opcode) {
+ case TCPOPT_EOL:
+ return;
+ case TCPOPT_NOP:
+ len--;
+ continue;
+ default:
+ opsize = *ptr++;
+ if (opsize < 2)
+ return;
+ if (opsize > len)
+ return;
+ if ((opcode == TCPOPT_TIMESTAMP) && (opsize == TCPOLEN_TIMESTAMP)) {
+ /* Overwrite the whole 10-byte option with NOPs (0x01). */
+ *(ptr - 2) = TCPOPT_NOP;
+ *(ptr - 1) = TCPOPT_NOP;
+ tmp = (uint32_t *)ptr;
+ *tmp++ = 0x01010101;
+ *tmp = 0x01010101;
+ }
+ ptr += opsize - 2;
+ len -= opsize;
+ }
+ }
+}
+
+static void
+tcp_response_rst(struct rte_mbuf *m, struct ipv4_hdr *iph, struct tcp_hdr *th,
+ uint16_t port_id) {
+ uint32_t tmpaddr;
+ struct tcp_hdr *nth;
+ uint16_t sport, dport;
+ uint32_t seq, ack;
+ uint8_t tcp_flags;
+
+ rte_pktmbuf_reset(m);
+ m->pkt_len = m->data_len =
+ ETHER_HDR_LEN + sizeof(struct ipv4_hdr) + sizeof(struct tcp_hdr);
+
+ iph->type_of_service = 0;
+ iph->total_length =
+ rte_cpu_to_be_16(sizeof(struct ipv4_hdr) + sizeof(struct tcp_hdr));
+ iph->packet_id = 0;
+ iph->fragment_offset = 0;
+ iph->time_to_live = 63;
+ tmpaddr = iph->src_addr;
+ iph->src_addr = iph->dst_addr;
+ iph->dst_addr = tmpaddr;
+ iph->hdr_checksum = 0;
+ iph->hdr_checksum = rte_ipv4_cksum(iph);
+
+ if (ACK(th)) {
+ /* RFC 793: a reset answering an ACK takes its sequence number
+ * from the segment's ACK field (both already big-endian). */
+ seq = th->recv_ack;
+ ack = 0;
+ tcp_flags = TCP_RST_FLAG;
+ } else {
+ seq = 0;
+ if (!SYN(th))
+ ack = th->sent_seq;
+ else
+ ack = rte_cpu_to_be_32(rte_be_to_cpu_32(th->sent_seq) + 1);
+ tcp_flags = TCP_RST_FLAG;
+ tcp_flags |= TCP_ACK_FLAG;
+ }
+ sport = th->src_port;
+ dport = th->dst_port;
+
+ nth = (struct tcp_hdr *)(iph + 1);
+ nth->src_port = dport;
+ nth->dst_port = sport;
+ nth->sent_seq = seq;
+ nth->recv_ack = ack;
+ nth->data_off = sizeof(struct tcp_hdr) << 2;
+ nth->tcp_flags = tcp_flags;
+ nth->rx_win = 0;
+ nth->tcp_urp = 0;
+ nth->cksum = 0;
+ nth->cksum = rte_ipv4_udptcp_cksum(iph, nth);
+
+ lb_device_output(m, iph, port_id);
+}
+
+static int
+tcp_fullnat_recv_client(struct rte_mbuf *m, struct ipv4_hdr *iph,
+ struct tcp_hdr *th, struct lb_conn_table *ct,
+ struct lb_conn *conn, uint16_t port_id) {
+ if (conn != NULL) {
+ if (conn->state == TCP_CONNTRACK_CLOSE) {
+ tcp_response_rst(m, iph, th, port_id);
+ return 0;
+ }
+ if (conn->state == TCP_CONNTRACK_TIME_WAIT) {
+ if (conn->flags & LB_CONN_F_SYNPROXY) {
+ if (!SYN(th) && ACK(th) && !RST(th) && !FIN(th)) {
+ lb_conn_expire(ct, conn);
+ conn = NULL;
+ }
+ } else {
+ if (SYN(th) && !ACK(th) && !RST(th) && !FIN(th)) {
+ lb_conn_expire(ct, conn);
+ conn = NULL;
+ }
+ }
+ }
+ }
+
+ if (conn == NULL) {
+ if
(synproxy_recv_client_ack(m, iph, th, ct, port_id) == 0) {
+ return 0;
+ }
+ conn = tcp_conn_schedule(ct, iph, th, port_id);
+ if (conn == NULL) {
+ TCP_PRINT(IPv4_TCP_FMT " [CONN SCHEDULE DROP]\n",
+ IPv4_TCP_ARG(iph, th));
+ rte_pktmbuf_free(m);
+ return 0;
+ }
+ }
+
+ if ((conn->flags & LB_CONN_F_SYNPROXY) &&
+ (conn->state == TCP_CONNTRACK_SYN_SENT) &&
+ (!SYN(th) && ACK(th) && !RST(th) && !FIN(th))) {
+ TCP_PRINT(IPv4_TCP_FMT " [SYNPROXY SYN_SENT DROP]\n",
+ IPv4_TCP_ARG(iph, th));
+ rte_pktmbuf_free(conn->proxy.ack_mbuf);
+ conn->proxy.ack_mbuf = m;
+ return 0;
+ }
+
+ if (!(conn->real_service->flags & LB_RS_F_AVAILABLE)) {
+ TCP_PRINT(IPv4_TCP_FMT " [RS NOT AVAILABLE DROP]\n",
+ IPv4_TCP_ARG(iph, th));
+ lb_conn_expire(ct, conn);
+ tcp_response_rst(m, iph, th, port_id);
+ return 0;
+ }
+
+ if (SYN(th)) {
+ tcp_opt_remove_timestamp(th);
+ tcp_secret_seq_init(conn->lip, conn->rip, conn->lport, conn->rport,
+ rte_be_to_cpu_32(th->sent_seq), &conn->tseq);
+ }
+
+ if ((conn->flags & LB_CONN_F_TOA) &&
+ (conn->state == TCP_CONNTRACK_SYN_RECV) && !SYN(th) && ACK(th) &&
+ !RST(th) && !FIN(th))
+ tcp_opt_add_toa(m, iph, th, conn->cip, conn->cport);
+
+ tcp_set_conntrack_state(conn, th, LB_DIR_ORIGINAL);
+ tcp_set_packet_stats(conn, m, LB_DIR_ORIGINAL);
+
+ iph->src_addr = conn->lip;
+ iph->dst_addr = conn->rip;
+ th->src_port = conn->lport;
+ th->dst_port = conn->rport;
+ tcp_secret_seq_adjust_client(th, &conn->tseq);
+ synproxy_seq_adjust_client(th, &conn->proxy);
+ iph->hdr_checksum = 0;
+ iph->hdr_checksum = rte_ipv4_cksum(iph);
+ th->cksum = 0;
+ th->cksum = rte_ipv4_udptcp_cksum(iph, th);
+
+ return lb_device_output(m, iph, port_id);
+}
+
+static int
+tcp_fullnat_recv_backend(struct rte_mbuf *m, struct ipv4_hdr *iph,
+ struct tcp_hdr *th, struct lb_conn *conn,
+ uint16_t port_id) {
+ if (synproxy_recv_backend_synack(m, iph, th, conn, port_id) == 0)
+ return 0;
+
+ tcp_set_conntrack_state(conn, th, LB_DIR_REPLY);
+ tcp_set_packet_stats(conn, m, LB_DIR_REPLY);
+
+ iph->src_addr = conn->vip;
+ iph->dst_addr = conn->cip;
+ th->src_port = conn->vport;
+ th->dst_port = conn->cport;
+ tcp_secret_seq_adjust_backend(th, &conn->tseq);
+ synproxy_seq_adjust_backend(th, &conn->proxy);
+ iph->hdr_checksum = 0;
+ iph->hdr_checksum = rte_ipv4_cksum(iph);
+ th->cksum = 0;
+ th->cksum = rte_ipv4_udptcp_cksum(iph, th);
+
+ return lb_device_output(m, iph, port_id);
+}
+
+static int
+tcp_fullnat_handle(struct rte_mbuf *m, struct ipv4_hdr *iph, uint16_t port_id) {
+ struct lb_conn_table *ct;
+ struct lb_conn *conn;
+ struct tcp_hdr *th;
+ uint8_t dir;
+
+ ct = &lb_conn_tbls[rte_lcore_id()];
+ th = TCP_HDR(iph);
+
+ TCP_PRINT(IPv4_TCP_FMT " [NEW PACKET]\n", IPv4_TCP_ARG(iph, th));
+
+ if (synproxy_recv_client_syn(m, iph, th, port_id) == 0)
+ return 0;
+
+ conn = lb_conn_find(ct, iph->src_addr, iph->dst_addr, th->src_port,
+ th->dst_port, &dir);
+ if (dir == LB_DIR_REPLY) {
+ TCP_PRINT(IPv4_TCP_FMT " [REPLY]\n", IPv4_TCP_ARG(iph, th));
+ return tcp_fullnat_recv_backend(m, iph, th, conn, port_id);
+ } else {
+ TCP_PRINT(IPv4_TCP_FMT " [ORIGINAL]\n", IPv4_TCP_ARG(iph, th));
+ return tcp_fullnat_recv_client(m, iph, th, ct, conn, port_id);
+ }
+}
+
+static int
+tcp_fullnat_init(void) {
+ uint32_t lcore_id;
+ struct lb_conn_table *ct;
+ int rc;
+
+ RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+ ct = &lb_conn_tbls[lcore_id];
+ rc = lb_conn_table_init(
+ ct, LB_IPPROTO_TCP, lcore_id, tcp_timeouts[TCP_CONNTRACK_NONE],
+ tcp_conn_timer_task_cb, tcp_conn_timer_expire_cb);
+ if (rc < 0) {
+ RTE_LOG(ERR, USER1, "%s(): 
lb_conn_table_init failed.\n", __func__); + return rc; + } + RTE_LOG(INFO, USER1, "%s(): Create tcp connection table on lcore%u.\n", + __func__, lcore_id); + } + + return 0; +} + +static struct lb_proto proto_tcp = { + .id = IPPROTO_TCP, + .type = LB_IPPROTO_TCP, + .init = tcp_fullnat_init, + .fullnat_handle = tcp_fullnat_handle, +}; + +LB_PROTO_REGISTER(proto_tcp); + +static void +tcp_drop_stats_cmd_cb(int fd, __attribute__((unused)) char *argv[], + __attribute__((unused)) int argc) { + uint32_t lcore_id; + struct lb_conn_table *ct; + uint64_t syn = 0; + uint64_t vip = 0; + uint64_t mp = 0; + uint64_t hash = 0; + uint64_t laddr = 0; + uint64_t sched = 0; + uint64_t rs = 0; + + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + ct = &lb_conn_tbls[lcore_id]; + syn += ct->drop_stats.syn; + vip += ct->drop_stats.vip; + mp += ct->drop_stats.mp; + hash += ct->drop_stats.hash; + laddr += ct->drop_stats.laddr; + sched += ct->drop_stats.sched; + rs += ct->drop_stats.rs; + } + unixctl_command_reply(fd, NORM_KV_64_FMT("syn", "\n"), syn); + unixctl_command_reply(fd, NORM_KV_64_FMT("vip", "\n"), vip); + unixctl_command_reply(fd, NORM_KV_64_FMT("mp", "\n"), mp); + unixctl_command_reply(fd, NORM_KV_64_FMT("hash", "\n"), hash); + unixctl_command_reply(fd, NORM_KV_64_FMT("laddr", "\n"), laddr); + unixctl_command_reply(fd, NORM_KV_64_FMT("sched", "\n"), sched); + unixctl_command_reply(fd, NORM_KV_64_FMT("rs", "\n"), rs); +} + +UNIXCTL_CMD_REGISTER("tcp/drop/stats", "", "Show TCP drop information.", 0, 0, + tcp_drop_stats_cmd_cb); + +static void +tcp_conn_dump_cmd_cb(int fd, __attribute__((unused)) char *argv[], + __attribute__((unused)) int argc) { + uint32_t lcore_id; + struct lb_conn_table *ct; + struct lb_conn *conn; + void *tmp; + + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + ct = &lb_conn_tbls[lcore_id]; + rte_spinlock_lock(&ct->spinlock); + for_each_conn_safe(conn, &ct->timeout_list, next, tmp) { + unixctl_command_reply( + fd, + "cip: " IPv4_BE_FMT ", cport: %u," + "vip: " IPv4_BE_FMT ", vport: %u," + "lip: " IPv4_BE_FMT ", lport: %u," + "rip: " IPv4_BE_FMT ", rport: %u," + "flags: 0x%x, state: %s, usetime:%u, timeout=%u\n", + IPv4_BE_ARG(conn->cip), rte_be_to_cpu_16(conn->cport), + IPv4_BE_ARG(conn->vip), rte_be_to_cpu_16(conn->vport), + IPv4_BE_ARG(conn->lip), rte_be_to_cpu_16(conn->lport), + IPv4_BE_ARG(conn->rip), rte_be_to_cpu_16(conn->rport), + conn->flags, tcp_conntrack_names[conn->state], conn->use_time, + conn->timeout); + } + rte_spinlock_unlock(&ct->spinlock); + } +} + +UNIXCTL_CMD_REGISTER("tcp/conn/dump", "", "Dump TCP connections.", 0, 0, + tcp_conn_dump_cmd_cb); + diff --git a/core/lb_proto_udp.c b/core/lb_proto_udp.c new file mode 100644 index 0000000..d6f4a5e --- /dev/null +++ b/core/lb_proto_udp.c @@ -0,0 +1,222 @@ +/* Copyright (c) 2018. TIG developer. 
*/ + +#include +#include + +#include + +#include "lb_clock.h" +#include "lb_conn.h" +#include "lb_format.h" +#include "lb_proto.h" + +static struct lb_conn_table lb_conn_tbls[RTE_MAX_LCORE]; +static uint32_t udp_timeout = 30 * LB_CLOCK_HZ; + +static void +udp_set_conntrack_state(struct lb_conn *conn, __rte_unused struct udp_hdr *uh, + int dir) { + struct lb_real_service *rs = conn->real_service; + struct lb_virt_service *vs = rs->virt_service; + uint32_t lcore_id = rte_lcore_id(); + + if (dir == LB_DIR_ORIGINAL) { + if (!(conn->flags & LB_CONN_F_ACTIVE)) { + conn->flags |= LB_CONN_F_ACTIVE; + rte_atomic32_add(&rs->active_conns, 1); + rte_atomic32_add(&vs->active_conns, 1); + vs->stats[lcore_id].conns += 1; + rs->stats[lcore_id].conns += 1; + } + } else { + if (conn->flags & LB_CONN_F_ACTIVE) { + conn->flags &= ~LB_CONN_F_ACTIVE; + rte_atomic32_add(&rs->active_conns, -1); + rte_atomic32_add(&vs->active_conns, -1); + } + } +} + +static void +udp_set_packet_stats(struct lb_conn *conn, struct rte_mbuf *m, uint8_t dir) { + struct lb_real_service *rs = conn->real_service; + struct lb_virt_service *vs = rs->virt_service; + uint32_t cid = rte_lcore_id(); + + vs->stats[cid].bytes[dir] += m->pkt_len; + vs->stats[cid].packets[dir] += 1; + rs->stats[cid].bytes[dir] += m->pkt_len; + rs->stats[cid].packets[dir] += 1; +} + +static int +udp_conn_timer_expire_cb(struct lb_conn *conn, uint32_t ctime) { + if (ctime - conn->use_time > conn->timeout) + return 0; + else + return -1; +} + +static struct lb_conn * +udp_conn_schedule(struct lb_conn_table *ct, struct ipv4_hdr *iph, + struct udp_hdr *uh, uint16_t port_id) { + struct lb_virt_service *vs = NULL; + struct lb_real_service *rs = NULL; + struct lb_conn *conn = NULL; + + if ((vs = lb_vs_get(iph->dst_addr, uh->dst_port, iph->next_proto_id)) && + (rs = lb_vs_get_rs(vs, iph->src_addr, uh->src_port)) && + (conn = lb_conn_new(ct, iph->src_addr, uh->src_port, rs, 0, port_id))) { + lb_vs_put(vs); + return conn; + } + if (vs != NULL) { + lb_vs_put(vs); + } + if (rs != NULL) { + lb_vs_put_rs(rs); + } + + return NULL; +} + +static int +udp_fullnat_recv_client(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct udp_hdr *uh, struct lb_conn_table *ct, + struct lb_conn *conn, uint16_t port_id) { + if (conn != NULL) { + lb_conn_expire(ct, conn); + conn = NULL; + } + + if (conn == NULL) { + conn = udp_conn_schedule(ct, iph, uh, port_id); + if (conn == NULL) { + rte_pktmbuf_free(m); + return 0; + } + } + + udp_set_conntrack_state(conn, uh, LB_DIR_ORIGINAL); + udp_set_packet_stats(conn, m, LB_DIR_ORIGINAL); + + iph->time_to_live = 63; + iph->src_addr = conn->lip; + iph->dst_addr = conn->rip; + uh->src_port = conn->lport; + uh->dst_port = conn->rport; + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + if (uh->dgram_cksum != 0) { + uh->dgram_cksum = 0; + uh->dgram_cksum = rte_ipv4_udptcp_cksum(iph, uh); + } + + return lb_device_output(m, iph, port_id); +} + +static int +udp_fullnat_recv_backend(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct udp_hdr *uh, struct lb_conn *conn, + uint16_t port_id) { + udp_set_conntrack_state(conn, uh, LB_DIR_REPLY); + udp_set_packet_stats(conn, m, LB_DIR_REPLY); + + iph->time_to_live = 63; + iph->src_addr = conn->vip; + iph->dst_addr = conn->cip; + uh->src_port = conn->vport; + uh->dst_port = conn->cport; + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + if (uh->dgram_cksum != 0) { + uh->dgram_cksum = 0; + uh->dgram_cksum = rte_ipv4_udptcp_cksum(iph, uh); + } + + return lb_device_output(m, iph, port_id); 
+} + +static int +udp_fullnat_handle(struct rte_mbuf *m, struct ipv4_hdr *iph, uint16_t port_id) { + struct lb_conn_table *ct; + struct lb_conn *conn; + struct udp_hdr *uh; + uint8_t dir; + int rc; + + ct = &lb_conn_tbls[rte_lcore_id()]; + uh = UDP_HDR(iph); + + conn = lb_conn_find(ct, iph->src_addr, iph->dst_addr, uh->src_port, + uh->dst_port, &dir); + if (dir == LB_DIR_REPLY) + rc = udp_fullnat_recv_backend(m, iph, uh, conn, port_id); + else + rc = udp_fullnat_recv_client(m, iph, uh, ct, conn, port_id); + + return rc; +} + +static int +udp_fullnat_init(void) { + uint32_t lcore_id; + struct lb_conn_table *ct; + int rc; + + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + ct = &lb_conn_tbls[lcore_id]; + rc = lb_conn_table_init(ct, LB_IPPROTO_UDP, lcore_id, udp_timeout, NULL, + udp_conn_timer_expire_cb); + if (rc < 0) { + RTE_LOG(ERR, USER1, "%s(): lb_conn_table_init failed.\n", __func__); + return rc; + } + RTE_LOG(INFO, USER1, "%s(): Create udp connection table on lcore%u.\n", + __func__, lcore_id); + } + + return 0; +} + +static struct lb_proto proto_udp = { + .id = IPPROTO_UDP, + .type = LB_IPPROTO_UDP, + .init = udp_fullnat_init, + .fullnat_handle = udp_fullnat_handle, +}; + +LB_PROTO_REGISTER(proto_udp); + +static void +udp_conn_dump_cmd_cb(int fd, __attribute__((unused)) char *argv[], + __attribute__((unused)) int argc) { + uint32_t lcore_id; + struct lb_conn_table *ct; + struct lb_conn *conn; + void *tmp; + + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + ct = &lb_conn_tbls[lcore_id]; + rte_spinlock_lock(&ct->spinlock); + for_each_conn_safe(conn, &ct->timeout_list, next, tmp) { + unixctl_command_reply( + fd, + "cip: " IPv4_BE_FMT ", cport: %u," + "vip: " IPv4_BE_FMT ", vport: %u," + "lip: " IPv4_BE_FMT ", lport: %u," + "rip: " IPv4_BE_FMT ", rport: %u," + "flags: 0x%x, usetime:%u, timeout=%u\n", + IPv4_BE_ARG(conn->cip), rte_be_to_cpu_16(conn->cport), + IPv4_BE_ARG(conn->vip), rte_be_to_cpu_16(conn->vport), + IPv4_BE_ARG(conn->lip), rte_be_to_cpu_16(conn->lport), + IPv4_BE_ARG(conn->rip), rte_be_to_cpu_16(conn->rport), + conn->flags, conn->use_time, conn->timeout); + } + rte_spinlock_unlock(&ct->spinlock); + } +} + +UNIXCTL_CMD_REGISTER("udp/conn/dump", "", "Dump UDP connections.", 0, 0, + udp_conn_dump_cmd_cb); + diff --git a/core/lb_scheduler.c b/core/lb_scheduler.c new file mode 100644 index 0000000..4c9ec49 --- /dev/null +++ b/core/lb_scheduler.c @@ -0,0 +1,429 @@ +/* Copyright (c) 2018. TIG developer. */ + +#include +#include + +#include + +#include +#include + +#include "conhash.h" +#include "lb_format.h" +#include "lb_scheduler.h" +#include "lb_service.h" + +//#define SCHED_DEBUG +#ifdef SCHED_DEBUG +#define SCHED_PRINT(...) \ + do { \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) +#else +#define SCHED_PRINT(...) \ + do { \ + } while (0) +#endif + +#define MAX_RS_REPLICA 256 + +static int +conhash_sched_init(struct lb_virt_service *vs) { + vs->sched_data = conhash_init(NULL); + return vs->sched_data != NULL ? 
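+ /* conhash_init(NULL) falls back to the conhash library's built-in
+ * hash function (an assumption about the bundled conhash API);
+ * NULL sched_data here means the allocation failed. */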
0 : -1; +} + +static void +conhash_sched_fini(struct lb_virt_service *vs) { + conhash_fini(vs->sched_data); +} + +#define IP_PORT_TO_STR(ip, port, s) \ + do { \ + s[0] = (unsigned char)((ip) >> 24 & 0xff); \ + s[1] = (unsigned char)((ip) >> 16 & 0xff); \ + s[2] = (unsigned char)((ip) >> 8 & 0xff); \ + s[3] = (unsigned char)((ip)&0xff); \ + s[4] = (unsigned char)((port) >> 8 & 0xff); \ + s[5] = (unsigned char)((port)&0xff); \ + s[6] = '\0'; \ + } while (0) + +static int +conhash_sched_add(struct lb_virt_service *vs, struct lb_real_service *rs) { + struct conhash_s *conhash = vs->sched_data; + struct node_s *node; + char buf[8]; + + if (unlikely(conhash == NULL)) + return -1; + node = rte_zmalloc_socket(NULL, sizeof(struct node_s), RTE_CACHE_LINE_SIZE, + vs->socket_id); + if (node != NULL) { + IP_PORT_TO_STR(rs->rip, rs->rport, buf); + conhash_set_node(node, buf, MAX_RS_REPLICA, rs); + conhash_add_node(conhash, node); + } + return (rs->sched_node = node) != NULL ? 0 : -1; +} + +static int +conhash_sched_del(struct lb_virt_service *vs, struct lb_real_service *rs) { + struct conhash_s *conhash = vs->sched_data; + + if (unlikely(conhash == NULL)) + return -1; + conhash_del_node(conhash, rs->sched_node); + rte_free(rs->sched_node); + rs->sched_node = NULL; + return 0; +} + +static int +conhash_sched_update(__rte_unused struct lb_virt_service *vs, + __rte_unused struct lb_real_service *rs) { + return 0; +} + +#define IP_PORT_TO_UINT64(ip, port) \ + (((uint64_t)(ip) << 32) | ((uint64_t)(port) << 16)) + +static struct lb_real_service * +conhash_schedule_ipport(struct lb_virt_service *vs, uint32_t ip, + uint16_t port) { + struct conhash_s *conhash = vs->sched_data; + uint64_t key; + struct node_s *node; + + if (unlikely(conhash == NULL)) + return NULL; + key = IP_PORT_TO_UINT64(ip, port); + node = conhash_lookup(conhash, (const char *)&key, sizeof(uint64_t)); + return node != NULL ? node->userdata : NULL; +} + +static struct lb_real_service * +conhash_schedule_iponly(struct lb_virt_service *vs, uint32_t ip, + __rte_unused uint16_t port) { + struct conhash_s *conhash = vs->sched_data; + struct node_s *node; + + if (unlikely(conhash == NULL)) + return NULL; + node = conhash_lookup(conhash, (const char *)&ip, sizeof(uint32_t)); + return node != NULL ? node->userdata : NULL; +} + +struct rr_data { + struct lb_real_service *real_services[RTE_MAX_LCORE]; +} __rte_cache_aligned; + +static int +rr_sched_init(struct lb_virt_service *vs) { + struct rr_data *rr; + + rr = rte_zmalloc_socket(NULL, sizeof(struct rr_data), RTE_CACHE_LINE_SIZE, + vs->socket_id); + return (vs->sched_data = rr) != NULL ? 
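+ /* rr_data keeps one cursor per lcore, so each worker core
+ * round-robins independently without hot-path synchronization. */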
0 : -1; +} + +static void +rr_sched_fini(struct lb_virt_service *vs) { + rte_free(vs->sched_data); +} + +static int +rr_sched_add(struct lb_virt_service *vs, + __rte_unused struct lb_real_service *rs) { + struct rr_data *rr = vs->sched_data; + uint32_t lcore_id; + + if (unlikely(rr == NULL)) + return -1; + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rr->real_services[lcore_id] = LIST_FIRST(&vs->real_services); + } + return 0; +} + +static int +rr_sched_del(struct lb_virt_service *vs, + __rte_unused struct lb_real_service *rs) { + struct rr_data *rr = vs->sched_data; + uint32_t lcore_id; + + if (unlikely(rr == NULL)) + return -1; + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rr->real_services[lcore_id] = LIST_FIRST(&vs->real_services); + } + return 0; +} + +static int +rr_sched_update(__rte_unused struct lb_virt_service *vs, + __rte_unused struct lb_real_service *rs) { + return 0; +} + +static struct lb_real_service * +rr_schedule(struct lb_virt_service *vs, __rte_unused uint32_t ip, + __rte_unused uint16_t port) { + uint32_t lcore_id = rte_lcore_id(); + struct rr_data *rr = vs->sched_data; + struct lb_real_service *rs, *p; + + if (unlikely(rr == NULL)) + return NULL; + + rs = rr->real_services[lcore_id]; + if (rs == NULL) + return NULL; + p = rs; + + do { + if (rs->flags & LB_RS_F_AVAILABLE) + goto hit; + rs = LIST_NEXT(rs, next); + if (rs == NULL) + rs = LIST_FIRST(&vs->real_services); + } while (rs != p); + + return NULL; + +hit: + SCHED_PRINT( + "RR: lcore%u, vip=" IPv4_BE_FMT ", vport=%u, proto=%u, rip=" IPv4_BE_FMT + ", rport=%u, weight=%u\n", + lcore_id, IPv4_BE_ARG(vs->vip), rte_be_to_cpu_16(vs->vport), vs->proto, + IPv4_BE_ARG(rs->rip), rte_be_to_cpu_16(rs->rport), rs->weight); + p = LIST_NEXT(rs, next); + if (p == NULL) + p = LIST_FIRST(&vs->real_services); + rr->real_services[lcore_id] = p; + return rs; +} + +struct wrr_data { + struct { + struct lb_real_service *real_service; + int cw; + } __rte_cache_aligned cores[RTE_MAX_LCORE]; + int mw; + int dw; +}; + +static int +wrr_sched_init(struct lb_virt_service *vs) { + struct wrr_data *wrr; + + wrr = rte_zmalloc_socket(NULL, sizeof(struct wrr_data), RTE_CACHE_LINE_SIZE, + vs->socket_id); + return (vs->sched_data = wrr) != NULL ? 0 : -1; +} + +static void +wrr_sched_fini(struct lb_virt_service *vs) { + rte_free(vs->sched_data); +} + +static int +wrr_max_weight(struct lb_virt_service *vs) { + struct lb_real_service *rs; + int max = 0; + + LIST_FOREACH(rs, &vs->real_services, next) { + if (!(rs->flags & LB_RS_F_AVAILABLE)) + continue; + if (max < rs->weight) + max = rs->weight; + } + return max; +} + +static int +gcd(int a, int b) { + int c; + + while ((c = a % b)) { + a = b; + b = c; + } + return b; +} + +static int +wrr_gcd_weight(struct lb_virt_service *vs) { + struct lb_real_service *rs; + int g = 0; + + LIST_FOREACH(rs, &vs->real_services, next) { + if (!(rs->flags & LB_RS_F_AVAILABLE)) + continue; + if (rs->weight == 0) + continue; + if (g == 0) + g = rs->weight; + else + g = gcd(g, rs->weight); + } + return g ? 
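+ /* g == 0 means no available real service has a non-zero weight; fall
+ * back to 1 so wrr_schedule()'s current-weight arithmetic still makes
+ * progress. With dw = gcd of the weights, each rs is served
+ * weight/dw times in a row, e.g. weights {4,2} give the pattern
+ * A,A,B,A,A,B,... */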
g : 1; +} + +static void +wrr_update_weight(struct lb_virt_service *vs) { + struct wrr_data *wrr = vs->sched_data; + struct lb_real_service *real_service; + uint32_t lcore_id; + int weight = 0; + + if (unlikely(wrr == NULL)) + return; + + wrr->mw = wrr_max_weight(vs); + wrr->dw = wrr_gcd_weight(vs); + LIST_FOREACH(real_service, &vs->real_services, next) { + if ((real_service->flags & LB_RS_F_AVAILABLE) && + (real_service->weight != 0)) { + weight = real_service->weight; + break; + } + } + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + wrr->cores[lcore_id].real_service = real_service; + wrr->cores[lcore_id].cw = weight; + } +} + +static int +wrr_sched_add(struct lb_virt_service *vs, + __rte_unused struct lb_real_service *rs) { + wrr_update_weight(vs); + return 0; +} + +static int +wrr_sched_del(struct lb_virt_service *vs, + __rte_unused struct lb_real_service *rs) { + wrr_update_weight(vs); + return 0; +} + +static int +wrr_sched_update(struct lb_virt_service *vs, + __rte_unused struct lb_real_service *rs) { + wrr_update_weight(vs); + return 0; +} + +static struct lb_real_service * +wrr_schedule(struct lb_virt_service *vs, __rte_unused uint32_t ip, + __rte_unused uint16_t port) { + uint32_t lcore_id = rte_lcore_id(); + struct wrr_data *wrr = vs->sched_data; + struct lb_real_service *rs, *p; + int cw; + + if (unlikely(wrr == NULL)) + return NULL; + + cw = wrr->cores[lcore_id].cw; + rs = wrr->cores[lcore_id].real_service; + if (rs == NULL) + return NULL; + p = rs; + + cw -= wrr->dw; + if (cw >= 0) + goto hit; + + do { + rs = LIST_NEXT(rs, next); + if (rs == NULL) + rs = LIST_FIRST(&vs->real_services); + if (rs->flags & LB_RS_F_AVAILABLE) { + cw = rs->weight; + cw -= wrr->dw; + if (cw >= 0) + goto hit; + } + } while (rs != p); + + return NULL; + +hit: + SCHED_PRINT( + "WRR: lcore%u, vip=" IPv4_BE_FMT + ", vport=%u, proto=%u, rip=" IPv4_BE_FMT ", rport=%u, weight=%u\n", + lcore_id, IPv4_BE_ARG(vs->vip), rte_be_to_cpu_16(vs->vport), vs->proto, + IPv4_BE_ARG(rs->rip), rte_be_to_cpu_16(rs->rport), rs->weight); + wrr->cores[lcore_id].cw = cw; + wrr->cores[lcore_id].real_service = rs; + return rs; +} + +enum sched_type { + LB_SCHED_T_IPPORT, + LB_SCHED_T_IPONLY, + LB_SCHED_T_RR, + LB_SCHED_T_WRR, + LB_SCHED_T_NONE, +}; + +static const struct lb_scheduler schedulers[LB_SCHED_T_NONE] = { + [LB_SCHED_T_IPPORT] = + { + .name = "ipport", + .init = conhash_sched_init, + .fini = conhash_sched_fini, + .add = conhash_sched_add, + .del = conhash_sched_del, + .update = conhash_sched_update, + .dispatch = conhash_schedule_ipport, + }, + [LB_SCHED_T_IPONLY] = + { + .name = "iponly", + .init = conhash_sched_init, + .fini = conhash_sched_fini, + .add = conhash_sched_add, + .del = conhash_sched_del, + .update = conhash_sched_update, + .dispatch = conhash_schedule_iponly, + }, + [LB_SCHED_T_RR] = + { + .name = "rr", + .init = rr_sched_init, + .fini = rr_sched_fini, + .add = rr_sched_add, + .del = rr_sched_del, + .update = rr_sched_update, + .dispatch = rr_schedule, + }, + [LB_SCHED_T_WRR] = + { + .name = "wrr", + .init = wrr_sched_init, + .fini = wrr_sched_fini, + .add = wrr_sched_add, + .del = wrr_sched_del, + .update = wrr_sched_update, + .dispatch = wrr_schedule, + }, +}; + +int +lb_scheduler_lookup_by_name(const char *name, + const struct lb_scheduler **sched) { + int i; + + for (i = 0; i < LB_SCHED_T_NONE; i++) { + if (strcasecmp(name, schedulers[i].name) == 0) { + *sched = &schedulers[i]; + return 0; + } + } + return -1; +} + diff --git a/core/lb_scheduler.h b/core/lb_scheduler.h new file mode 100644 index 
0000000..1bc38eb --- /dev/null +++ b/core/lb_scheduler.h @@ -0,0 +1,24 @@ +/* Copyright (c) 2018. TIG developer. */ + +#ifndef __LB_SCHEDULER_H__ +#define __LB_SCHEDULER_H__ + +struct lb_real_service; +struct lb_virt_service; + +struct lb_scheduler { + const char *name; + int (*init)(struct lb_virt_service *); + void (*fini)(struct lb_virt_service *); + int (*add)(struct lb_virt_service *, struct lb_real_service *); + int (*del)(struct lb_virt_service *, struct lb_real_service *); + int (*update)(struct lb_virt_service *, struct lb_real_service *); + struct lb_real_service *(*dispatch)(struct lb_virt_service *, uint32_t, + uint16_t); +}; + +int lb_scheduler_lookup_by_name(const char *name, + const struct lb_scheduler **sched); + +#endif + diff --git a/core/lb_service.c b/core/lb_service.c new file mode 100644 index 0000000..f3be839 --- /dev/null +++ b/core/lb_service.c @@ -0,0 +1,1658 @@ +/* Copyright (c) 2018. TIG developer. */ + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "lb_clock.h" +#include "lb_device.h" +#include "lb_format.h" +#include "lb_parser.h" +#include "lb_scheduler.h" +#include "lb_service.h" + +#define virt_service_key(ip, port, proto) \ + (((uint64_t)(ip) << 32) | ((uint64_t)(port) << 16) | (uint64_t)(proto)) + +struct lb_vs_table { + struct rte_hash *vs_htbl; + struct rte_hash *vip_htbl; + rte_rwlock_t rwlock; +} __rte_cache_aligned; + +#define LB_VS_TBL_WLOCK(t) rte_rwlock_write_lock(&(t)->rwlock) +#define LB_VS_TBL_WUNLOCK(t) rte_rwlock_write_unlock(&(t)->rwlock) +#define LB_VS_TBL_RLOCK(t) rte_rwlock_read_lock(&(t)->rwlock) +#define LB_VS_TBL_RUNLOCK(t) rte_rwlock_read_unlock(&(t)->rwlock) + +#define LB_VS_WLOCK(t) rte_rwlock_write_lock(&(t)->rwlock) +#define LB_VS_WUNLOCK(t) rte_rwlock_write_unlock(&(t)->rwlock) +#define LB_VS_RLOCK(t) rte_rwlock_read_lock(&(t)->rwlock) +#define LB_VS_RUNLOCK(t) rte_rwlock_read_unlock(&(t)->rwlock) + +static struct lb_vs_table *lb_vs_tbls[RTE_MAX_NUMA_NODES]; + +static inline uint32_t +vs_tbl_get_next(int sid) { + sid++; + while (sid < RTE_MAX_NUMA_NODES) { + if (lb_vs_tbls[sid] == NULL) { + sid++; + continue; + } + break; + } + return sid; +} + +#define VS_TBL_FOREACH_SOCKET(socket_id) \ + for (socket_id = vs_tbl_get_next(-1); socket_id < RTE_MAX_NUMA_NODES; \ + socket_id = vs_tbl_get_next(socket_id)) + +int +lb_service_init(void) { + uint32_t port_id, nb_ports; + uint32_t socket_id; + char name[RTE_HASH_NAMESIZE]; + struct rte_hash_parameters param; + struct lb_vs_table *t; + + nb_ports = rte_eth_dev_count(); + for (port_id = 0; port_id < nb_ports; port_id++) { + socket_id = rte_eth_dev_socket_id(port_id); + + if (lb_vs_tbls[socket_id] != NULL) + continue; + t = rte_zmalloc_socket("lb_vs_table", sizeof(*t), RTE_CACHE_LINE_SIZE, + socket_id); + if (t == NULL) { + RTE_LOG(ERR, USER1, "%s(): Not enough memory.", __func__); + return -1; + } + + memset(¶m, 0, sizeof(param)); + snprintf(name, sizeof(name), "vs_htbl%u", socket_id); + param.name = name; + param.entries = LB_MAX_VS; + param.key_len = sizeof(uint64_t); + param.socket_id = socket_id; + param.hash_func = rte_hash_crc; + + t->vs_htbl = rte_hash_create(¶m); + if (t->vs_htbl == NULL) { + RTE_LOG(ERR, USER1, "%s(): Create hash table %s failed, %s.", + __func__, name, rte_strerror(rte_errno)); + return -1; + } + + memset(¶m, 0, sizeof(param)); + snprintf(name, sizeof(name), "vip_htbl%u", socket_id); + param.name = name; + param.entries = LB_MAX_VS; + param.key_len = sizeof(uint32_t); + param.socket_id = socket_id; + 
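+ /* vip_htbl keys on the bare VIP and stores, as its value, a count of
+ * virt services sharing that VIP, making lb_is_vip_exist() a single
+ * hash lookup. */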
param.hash_func = rte_hash_crc;
+
+ t->vip_htbl = rte_hash_create(&param);
+ if (t->vip_htbl == NULL) {
+ RTE_LOG(ERR, USER1, "%s(): Create hash table %s failed, %s.",
+ __func__, name, rte_strerror(rte_errno));
+ return -1;
+ }
+
+ rte_rwlock_init(&t->rwlock);
+
+ lb_vs_tbls[socket_id] = t;
+ }
+
+ return 0;
+}
+
+int
+lb_is_vip_exist(uint32_t vip) {
+ struct lb_vs_table *t;
+
+ t = lb_vs_tbls[rte_socket_id()];
+ return rte_hash_lookup(t->vip_htbl, &vip) >= 0;
+}
+
+static struct lb_virt_service *
+vs_tbl_find(struct lb_vs_table *t, uint32_t vip, uint16_t vport,
+ uint8_t proto) {
+ struct lb_virt_service *vs = NULL;
+ uint64_t key;
+
+ key = virt_service_key(vip, vport, proto);
+ rte_hash_lookup_data(t->vs_htbl, &key, (void **)&vs);
+
+ return vs;
+}
+
+static int
+vs_tbl_add(struct lb_vs_table *t, struct lb_virt_service *vs) {
+ uint64_t key;
+ int rc;
+ void *p;
+ uint32_t count = 0;
+
+ key = virt_service_key(vs->vip, vs->vport, vs->proto);
+ rc = rte_hash_add_key_data(t->vs_htbl, &key, vs);
+ if (rc < 0) {
+ return rc;
+ }
+
+ /* Track how many virt services share this VIP. */
+ rc = rte_hash_lookup_data(t->vip_htbl, &vs->vip, &p);
+ if (rc >= 0) {
+ count = (uint32_t)(uintptr_t)p;
+ }
+ count += 1;
+
+ rc = rte_hash_add_key_data(t->vip_htbl, &vs->vip, (void *)(uintptr_t)count);
+ if (unlikely(rc < 0)) {
+ rte_hash_del_key(t->vs_htbl, &key);
+ }
+
+ return rc;
+}
+
+static void
+vs_tbl_del(struct lb_vs_table *t, struct lb_virt_service *vs) {
+ uint64_t key;
+ int rc;
+ void *p;
+ uint32_t count;
+
+ key = virt_service_key(vs->vip, vs->vport, vs->proto);
+ rc = rte_hash_del_key(t->vs_htbl, &key);
+ if (rc < 0) {
+ return;
+ }
+
+ rc = rte_hash_lookup_data(t->vip_htbl, &vs->vip, &p);
+ if (unlikely(rc < 0)) {
+ return;
+ }
+
+ count = (uint32_t)(uintptr_t)p;
+ count -= 1;
+ if (count == 0) {
+ /* Last virt service on this VIP: remove the VIP entry itself. */
+ rte_hash_del_key(t->vip_htbl, &vs->vip);
+ } else {
+ rte_hash_add_key_data(t->vip_htbl, &vs->vip, (void *)(uintptr_t)count);
+ }
+}
+
+static struct lb_real_service *
+vs_find_rs(struct lb_virt_service *vs, uint32_t rip, uint16_t rport) {
+ struct lb_real_service *rs;
+
+ LIST_FOREACH(rs, &vs->real_services, next) {
+ if (rs->rip == rip && rs->rport == rport)
+ return rs;
+ }
+ return NULL;
+}
+
+static void
+lb_rs_list_insert_by_weight(struct lb_virt_service *vs,
+ struct lb_real_service *rs) {
+ struct lb_real_service *real_service;
+
+ if (LIST_EMPTY(&vs->real_services)) {
+ LIST_INSERT_HEAD(&vs->real_services, rs, next);
+ return;
+ }
+
+ LIST_FOREACH(real_service, &vs->real_services, next) {
+ if (LIST_NEXT(real_service, next) == NULL)
+ break;
+ if (real_service->weight <= rs->weight)
+ break;
+ }
+
+ if (real_service->weight <= rs->weight)
+ LIST_INSERT_BEFORE(real_service, rs, next);
+ else
+ LIST_INSERT_AFTER(real_service, rs, next);
+}
+
+static void
+lb_rs_list_update_by_weight(struct lb_virt_service *vs,
+ struct lb_real_service *rs) {
+ LIST_REMOVE(rs, next);
+ lb_rs_list_insert_by_weight(vs, rs);
+}
+
+struct lb_virt_service *
+lb_vs_get(uint32_t vip, uint16_t vport, uint8_t proto) {
+ uint32_t socket_id = rte_socket_id();
+ struct lb_virt_service *vs;
+
+ LB_VS_TBL_RLOCK(lb_vs_tbls[socket_id]);
+ vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto);
+ if (vs != NULL) {
+ rte_atomic32_add(&vs->refcnt, 1);
+ }
+ LB_VS_TBL_RUNLOCK(lb_vs_tbls[socket_id]);
+
+ return vs;
+}
+
+void
+lb_vs_put(struct lb_virt_service *vs) {
+ lb_vs_free(vs);
+}
+
+struct lb_real_service *
+lb_vs_get_rs(struct lb_virt_service *vs, uint32_t cip, uint16_t cport) {
+ struct lb_real_service *rs;
+
+ LB_VS_RLOCK(vs);
+ rs = vs->sched->dispatch(vs, cip, cport);
+ if
(rs != NULL) { + rte_atomic32_add(&rs->refcnt, 1); + } + LB_VS_RUNLOCK(vs); + + return rs; +} + +void +lb_vs_put_rs(struct lb_real_service *rs) { + lb_rs_free(rs); +} + +static struct lb_virt_service * +lb_vs_alloc(uint32_t vip, uint16_t vport, uint8_t proto, + const struct lb_scheduler *sched, uint32_t socket_id) { + struct lb_virt_service *vs; + + vs = rte_zmalloc_socket("vs", sizeof(*vs), RTE_CACHE_LINE_SIZE, socket_id); + if (vs == NULL) + return NULL; + + if (sched->init && sched->init(vs) < 0) { + rte_free(vs); + return NULL; + } + + vs->vip = vip; + vs->vport = vport; + vs->proto = proto; + vs->sched = sched; + vs->max_conns = INT32_MAX; + vs->socket_id = socket_id; + rte_atomic32_set(&vs->refcnt, 1); + + return vs; +} + +void +lb_vs_free(struct lb_virt_service *vs) { + if (vs == NULL) + return; + + if (rte_atomic32_add_return(&vs->refcnt, -1) != 0) + return; + if (vs->sched->fini) + vs->sched->fini(vs); + rte_free(vs); +} + +static struct lb_real_service * +lb_rs_alloc(uint32_t rip, uint32_t rport, int weight, + struct lb_virt_service *vs) { + struct lb_real_service *rs; + + rs = rte_zmalloc_socket("rs", sizeof(*rs), RTE_CACHE_LINE_SIZE, + vs->socket_id); + if (rs == NULL) + return NULL; + + rs->rip = rip; + rs->rport = rport; + rs->proto = vs->proto; + rs->weight = weight; + rs->virt_service = vs; + rte_atomic32_add(&vs->refcnt, 1); + rte_atomic32_set(&rs->refcnt, 1); + + return rs; +} + +void +lb_rs_free(struct lb_real_service *rs) { + if (rs == NULL) + return; + if (rte_atomic32_add_return(&rs->refcnt, -1) != 0) + return; + lb_vs_free(rs->virt_service); + rte_free(rs); +} + +static void +vs_del_all_rs(struct lb_virt_service *vs) { + struct lb_real_service *rs; + + while ((rs = LIST_FIRST(&vs->real_services)) != NULL) { + LIST_REMOVE(rs, next); + vs->sched->del(vs, rs); + lb_rs_free(rs); + } +} + +/* UNIXCTL COMMAND */ + +static int +vs_add_arg_parse(char *argv[], __attribute((unused)) int argc, uint32_t *vip, + uint16_t *vport, uint8_t *proto, + const struct lb_scheduler **sched) { + int i = 0; + int rc; + + /* ip:port */ + rc = parse_ipv4_port(argv[i++], vip, vport); + if (rc < 0) { + return i - 1; + } + + /* proto */ + rc = parse_l4_proto(argv[i++], proto); + if (rc < 0) { + return i - 1; + } + + /* scheduler */ + rc = lb_scheduler_lookup_by_name(argv[i++], sched); + if (rc < 0) { + return i - 1; + } + + return i; +} + +static void +vs_add_cmd_cb(int fd, char *argv[], int argc) { + uint32_t vip; + uint16_t vport; + uint8_t proto; + const struct lb_scheduler *sched; + int rc; + struct lb_virt_service *vss[RTE_MAX_NUMA_NODES] = {0}; + uint32_t socket_id; + + memset(vss, 0, sizeof(vss)); + + rc = vs_add_arg_parse(argv, argc, &vip, &vport, &proto, &sched); + if (rc != argc) { + unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]); + return; + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + if (vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto) != NULL) { + unixctl_command_reply_error(fd, "Virt service already exists.\n"); + goto free_vss; + } + + vss[socket_id] = lb_vs_alloc(vip, vport, proto, sched, socket_id); + if (vss[socket_id] == NULL) { + unixctl_command_reply_error(fd, "Not enough memory.\n"); + goto free_vss; + } + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + LB_VS_TBL_WLOCK(lb_vs_tbls[socket_id]); + rc = vs_tbl_add(lb_vs_tbls[socket_id], vss[socket_id]); + LB_VS_TBL_WUNLOCK(lb_vs_tbls[socket_id]); + if (rc < 0) { + unixctl_command_reply_error(fd, "No space in the table.\n"); + goto del_vss; + } + } + + return; + +del_vss: + VS_TBL_FOREACH_SOCKET(socket_id) { 
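+ /* Rollback: a later socket ran out of table space, so unlink the
+ * replicas that were already added before freeing them below. */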
+ LB_VS_TBL_WLOCK(lb_vs_tbls[socket_id]); + vs_tbl_del(lb_vs_tbls[socket_id], vss[socket_id]); + LB_VS_TBL_WUNLOCK(lb_vs_tbls[socket_id]); + } + +free_vss: + VS_TBL_FOREACH_SOCKET(socket_id) { lb_vs_free(vss[socket_id]); } +} + +UNIXCTL_CMD_REGISTER("vs/add", "VIP:VPORT tcp|udp ipport|iponly|rr|wrr.", + "Add virtual service.", 3, 3, vs_add_cmd_cb); + +static int +vs_del_arg_parse(char *argv[], __attribute((unused)) int argc, uint32_t *vip, + uint16_t *vport, uint8_t *proto) { + int rc; + int i = 0; + + /* ip:port */ + rc = parse_ipv4_port(argv[i++], vip, vport); + if (rc < 0) { + return i - 1; + } + + /* proto */ + rc = parse_l4_proto(argv[i++], proto); + if (rc < 0) { + return i - 1; + } + + return i; +} + +static void +vs_del_cmd_cb(int fd, char *argv[], int argc) { + uint32_t vip; + uint16_t vport; + uint8_t proto; + int rc; + uint32_t socket_id; + struct lb_virt_service *vs; + + rc = vs_del_arg_parse(argv, argc, &vip, &vport, &proto); + if (rc != argc) { + unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]); + return; + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto); + if (vs != NULL) { + LB_VS_WLOCK(vs); + vs_del_all_rs(vs); + LB_VS_WUNLOCK(vs); + + LB_VS_TBL_WLOCK(lb_vs_tbls[socket_id]); + vs_tbl_del(lb_vs_tbls[socket_id], vs); + LB_VS_TBL_WUNLOCK(lb_vs_tbls[socket_id]); + + lb_vs_free(vs); + } + } +} + +UNIXCTL_CMD_REGISTER("vs/del", "VIP:VPORT tcp|udp.", "Delete virtual service.", + 2, 2, vs_del_cmd_cb); + +static inline const char * +l4proto_format(uint8_t l4proto) { + if (l4proto == IPPROTO_TCP) + return "tcp"; + if (l4proto == IPPROTO_UDP) + return "udp"; + return "oth"; +} + +static int +vs_list_arg_parse(char *argv[], int argc, int *json_fmt) { + int i = 0; + int rc; + + if (i < argc) { + rc = strcmp(argv[i++], "--json"); + if (rc != 0) + return i - 1; + *json_fmt = 1; + } else { + *json_fmt = 0; + } + + return i; +} + +static void +vs_list_cmd_cb(int fd, char *argv[], int argc) { + int json_fmt, json_first_obj = 1; + int rc; + uint32_t socket_id; + struct lb_vs_table *t = NULL; + const void *key; + uint32_t next = 0; + struct lb_virt_service *vs; + char buf[32]; + + rc = vs_list_arg_parse(argv, argc, &json_fmt); + if (rc != argc) { + unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]); + return; + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + t = lb_vs_tbls[socket_id]; + + unixctl_command_reply(fd, json_fmt ? "[" + : "IP Port " + "Type Sched Max_conns\n"); + while (rte_hash_iterate(t->vs_htbl, &key, (void **)&vs, &next) >= 0) { + ipv4_addr_tostring(vs->vip, buf, sizeof(buf)); + + if (json_fmt) { + unixctl_command_reply(fd, json_first_obj ? 
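+ /* Hand-rolled JSON array: the first object is emitted as "{",
+ * every subsequent one as ",{". */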
"{" : ",{"); + json_first_obj = 0; + unixctl_command_reply(fd, JSON_KV_S_FMT("ip", ","), buf); + unixctl_command_reply(fd, JSON_KV_32_FMT("port", ","), + rte_be_to_cpu_16(vs->vport)); + unixctl_command_reply(fd, JSON_KV_S_FMT("type", ","), + l4proto_format(vs->proto)); + unixctl_command_reply(fd, JSON_KV_S_FMT("sched", ","), + vs->sched->name); + unixctl_command_reply(fd, JSON_KV_32_FMT("max_conns", "}"), + vs->max_conns); + } else { + unixctl_command_reply(fd, "%-15s %-5u %-5s %-10s %d\n", buf, + rte_be_to_cpu_16(vs->vport), + l4proto_format(vs->proto), + vs->sched->name, vs->max_conns); + } + } + if (json_fmt) + unixctl_command_reply(fd, "]\n"); + + break; + } +} +UNIXCTL_CMD_REGISTER("vs/list", "[--json].", "List all virtual services.", 0, 1, + vs_list_cmd_cb); + +static int +vs_synproxy_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport, + uint8_t *proto, uint8_t *echo, uint8_t *op) { + int rc; + int i = 0; + + /* ip:port */ + rc = parse_ipv4_port(argv[i++], vip, vport); + if (rc < 0) + return i - 1; + + /* proto */ + rc = parse_l4_proto(argv[i++], proto); + if (rc < 0 || *proto != IPPROTO_TCP) + return i - 1; + + if (i < argc) { + *echo = 0; + rc = parser_read_uint8(op, argv[i++]); + if (rc < 0) + return i - 1; + } else { + *echo = 1; + } + + return i; +} + +static void +vs_synproxy_cmd_cb(int fd, char *argv[], int argc) { + uint32_t vip; + uint16_t vport; + uint8_t proto; + uint8_t echo = 0; + uint8_t op; + int rc; + struct lb_virt_service *vs; + uint32_t socket_id; + + rc = vs_synproxy_arg_parse(argv, argc, &vip, &vport, &proto, &echo, &op); + if (rc != argc) { + unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]); + return; + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto); + if (vs == NULL) { + unixctl_command_reply_error(fd, "Cannot find virt service.\n"); + return; + } + if (echo) { + unixctl_command_reply(fd, "%u\n", !!(vs->flags & LB_VS_F_SYNPROXY)); + return; + } + + if (op) { + vs->flags |= LB_VS_F_SYNPROXY; + } else { + vs->flags &= ~LB_VS_F_SYNPROXY; + } + } + + return; +} + +UNIXCTL_CMD_REGISTER("vs/synproxy", "VIP:VPORT tcp [0|1].", + "Show or set synproxy.", 2, 3, vs_synproxy_cmd_cb); + +static int +vs_toa_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport, + uint8_t *proto, uint8_t *echo, uint8_t *op) { + int rc; + int i = 0; + + /* ip:port */ + rc = parse_ipv4_port(argv[i++], vip, vport); + if (rc < 0) + return i - 1; + + /* proto */ + rc = parse_l4_proto(argv[i++], proto); + if (rc < 0 || *proto != IPPROTO_TCP) + return i - 1; + + if (i < argc) { + *echo = 0; + rc = parser_read_uint8(op, argv[i++]); + if (rc < 0) + return i - 1; + } else { + *echo = 1; + } + + return i; +} + +static void +vs_toa_cmd_cb(int fd, char *argv[], int argc) { + uint32_t vip; + uint16_t vport; + uint8_t proto; + uint8_t echo = 0; + uint8_t op; + int rc; + struct lb_virt_service *vs; + uint32_t socket_id; + + rc = vs_toa_arg_parse(argv, argc, &vip, &vport, &proto, &echo, &op); + if (rc != argc) { + unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]); + return; + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto); + if (vs == NULL) { + unixctl_command_reply_error(fd, "Cannot find virt service.\n"); + return; + } + if (echo) { + unixctl_command_reply(fd, "%u\n", !!(vs->flags & LB_VS_F_TOA)); + return; + } + + if (op) { + vs->flags |= LB_VS_F_TOA; + } else { + vs->flags &= ~LB_VS_F_TOA; + } + } + + return; +} + 
+UNIXCTL_CMD_REGISTER("vs/toa", "VIP:VPORT tcp [0|1].", "Show or set toa.", 2, 3,
+ vs_toa_cmd_cb);
+
+static int
+vs_max_conn_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport,
+ uint8_t *proto, uint8_t *echo, int *max) {
+ int rc;
+ int i = 0;
+
+ /* ip:port */
+ rc = parse_ipv4_port(argv[i++], vip, vport);
+ if (rc < 0)
+ return i - 1;
+
+ /* proto */
+ rc = parse_l4_proto(argv[i++], proto);
+ if (rc < 0)
+ return i - 1;
+
+ if (i < argc) {
+ *echo = 0;
+ rc = parser_read_int32(max, argv[i++]);
+ if (rc < 0 || *max < 0)
+ return i - 1;
+ } else {
+ *echo = 1;
+ }
+
+ return i;
+}
+
+static void
+vs_max_conn_cmd_cb(int fd, char *argv[], int argc) {
+ uint32_t vip;
+ uint16_t vport;
+ uint8_t proto;
+ uint8_t echo = 0;
+ int max;
+ int rc;
+ struct lb_virt_service *vs;
+ uint32_t socket_id;
+
+ rc = vs_max_conn_arg_parse(argv, argc, &vip, &vport, &proto, &echo, &max);
+ if (rc != argc) {
+ unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]);
+ return;
+ }
+
+ VS_TBL_FOREACH_SOCKET(socket_id) {
+ vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto);
+ if (vs == NULL) {
+ unixctl_command_reply_error(fd, "Cannot find virt service.\n");
+ return;
+ }
+ if (echo) {
+ unixctl_command_reply(fd, "%d\n", vs->max_conns);
+ return;
+ }
+
+ vs->max_conns = max;
+ }
+
+ return;
+}
+
+UNIXCTL_CMD_REGISTER("vs/max_conns", "VIP:VPORT tcp|udp [MAX_CONNS].",
+ "Show or set max_conns.", 2, 3, vs_max_conn_cmd_cb);
+
+static int
+vs_est_timeout_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport,
+ uint8_t *proto, uint8_t *echo, uint32_t *timeout) {
+ int rc;
+ int i = 0;
+
+ /* ip:port */
+ rc = parse_ipv4_port(argv[i++], vip, vport);
+ if (rc < 0)
+ return i - 1;
+
+ /* proto */
+ rc = parse_l4_proto(argv[i++], proto);
+ if (rc < 0)
+ return i - 1;
+
+ if (i < argc) {
+ *echo = 0;
+ rc = parser_read_uint32(timeout, argv[i++]);
+ if (rc < 0)
+ return i - 1;
+ } else {
+ *echo = 1;
+ }
+
+ return i;
+}
+
+static void
+vs_est_timeout_cmd_cb(int fd, char *argv[], int argc) {
+ uint32_t vip;
+ uint16_t vport;
+ uint8_t proto;
+ uint8_t echo = 0;
+ uint32_t timeout;
+ int rc;
+ struct lb_virt_service *vs;
+ uint32_t socket_id;
+
+ rc = vs_est_timeout_arg_parse(argv, argc, &vip, &vport, &proto, &echo,
+ &timeout);
+ if (rc != argc) {
+ unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]);
+ return;
+ }
+
+ VS_TBL_FOREACH_SOCKET(socket_id) {
+ vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto);
+ if (vs == NULL) {
+ unixctl_command_reply_error(fd, "Cannot find virt service.\n");
+ return;
+ }
+ if (echo) {
+ unixctl_command_reply(fd, "%u\n", LB_CLOCK_TO_SEC(vs->est_timeout));
+ return;
+ }
+ vs->est_timeout = SEC_TO_LB_CLOCK(timeout);
+ }
+}
+
+UNIXCTL_CMD_REGISTER("vs/est_timeout", "VIP:VPORT tcp|udp [SEC].",
+ "Show or set TCP established timeout.", 2, 3,
+ vs_est_timeout_cmd_cb);
+
+static int
+vs_scheduler_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport,
+ uint8_t *proto, uint8_t *echo,
+ const struct lb_scheduler **sched) {
+ int rc;
+ int i = 0;
+
+ /* ip:port */
+ rc = parse_ipv4_port(argv[i++], vip, vport);
+ if (rc < 0)
+ return i - 1;
+
+ /* proto */
+ rc = parse_l4_proto(argv[i++], proto);
+ if (rc < 0)
+ return i - 1;
+
+ if (i < argc) {
+ *echo = 0;
+ rc = lb_scheduler_lookup_by_name(argv[i++], sched);
+ if (rc < 0) {
+ return i - 1;
+ }
+ } else {
+ *echo = 1;
+ }
+
+ return i;
+}
+
+static void
+vs_scheduler_cmd_cb(int fd, char *argv[], int argc) {
+ uint32_t vip;
+ uint16_t vport;
+ uint8_t proto;
+ uint8_t echo = 0;
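+ /* Swapping schedulers is a stop-the-world operation on this virt
+ * service: under the write lock every available rs is removed from
+ * the old scheduler, the old scheduler is finalized, the new one is
+ * initialized, and the rs list is re-added; entries the new
+ * scheduler rejects are marked unavailable. */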
+ const struct lb_scheduler *sched; + int rc; + struct lb_virt_service *vs; + struct lb_real_service *rs; + uint32_t socket_id; + + rc = + vs_scheduler_arg_parse(argv, argc, &vip, &vport, &proto, &echo, &sched); + if (rc != argc) { + unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]); + return; + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto); + if (vs == NULL) { + unixctl_command_reply_error(fd, "Cannot find virt service.\n"); + return; + } + if (echo) { + unixctl_command_reply(fd, "%s\n", vs->sched->name); + return; + } + + if (sched == vs->sched) + break; + + LB_VS_WLOCK(vs); + LIST_FOREACH(rs, &vs->real_services, next) { + if (rs->flags & LB_RS_F_AVAILABLE) + vs->sched->del(vs, rs); + } + if (vs->sched->fini) + vs->sched->fini(vs); + + vs->sched = sched; + if (vs->sched->init && vs->sched->init(vs) < 0) { + LIST_FOREACH(rs, &vs->real_services, next) { + rs->flags &= ~LB_RS_F_AVAILABLE; + } + LB_VS_WUNLOCK(vs); + unixctl_command_reply_error(fd, "Cannot init scheduler %s.\n", + sched->name); + return; + } + + LIST_FOREACH(rs, &vs->real_services, next) { + if ((rs->flags & LB_RS_F_AVAILABLE) && vs->sched->add(vs, rs) < 0) { + rs->flags &= ~LB_RS_F_AVAILABLE; + } + } + LB_VS_WUNLOCK(vs); + } +} + +UNIXCTL_CMD_REGISTER("vs/scheduler", + "VIP:VPORT tcp|udp [iponly|ipport|rr|wrr].", + "Show or set scheduler.", 2, 3, vs_scheduler_cmd_cb); + +static int +vs_stats_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport, + uint8_t *proto, int *json_fmt) { + int rc; + int i = 0; + + /* ip:port */ + rc = parse_ipv4_port(argv[i++], vip, vport); + if (rc < 0) + return i - 1; + + /* proto */ + rc = parse_l4_proto(argv[i++], proto); + if (rc < 0) + return i - 1; + + if (i < argc) { + *json_fmt = 1; + rc = strcmp(argv[i++], "--json"); + if (rc != 0) + return i - 1; + } else { + *json_fmt = 0; + } + + return i; +} + +static void +vs_stats_cmd_cb(int fd, char *argv[], int argc) { + uint32_t vip; + uint16_t vport; + uint8_t proto; + int json_fmt = 0; + int rc; + struct lb_virt_service *vs; + struct lb_real_service *rs; + uint32_t socket_id, lcore_id; + uint64_t rx_packets[2] = {0}, rx_bytes[2] = {0}, rx_drops[2] = {0}; + uint64_t tx_packets[2] = {0}, tx_bytes[2] = {0}; + uint64_t active_conns = 0, history_conns = 0; + + rc = vs_stats_arg_parse(argv, argc, &vip, &vport, &proto, &json_fmt); + if (rc != argc) { + unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]); + return; + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto); + if (vs == NULL) { + unixctl_command_reply_error(fd, "Cannot find virt service.\n"); + return; + } + + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + rx_packets[0] += vs->stats[lcore_id].packets[0]; + rx_packets[1] += vs->stats[lcore_id].packets[1]; + rx_bytes[0] += vs->stats[lcore_id].bytes[0]; + rx_bytes[1] += vs->stats[lcore_id].bytes[1]; + rx_drops[0] += vs->stats[lcore_id].drops[0]; + rx_drops[1] += vs->stats[lcore_id].drops[1]; + history_conns += vs->stats[lcore_id].conns; + } + active_conns += (uint64_t)rte_atomic32_read(&vs->active_conns); + LIST_FOREACH(rs, &vs->real_services, next) { + RTE_LCORE_FOREACH_SLAVE(lcore_id) { + tx_packets[0] += rs->stats[lcore_id].packets[0]; + tx_packets[1] += rs->stats[lcore_id].packets[1]; + tx_bytes[0] += rs->stats[lcore_id].bytes[0]; + tx_bytes[1] += rs->stats[lcore_id].bytes[1]; + } + } + } + + if (json_fmt) + unixctl_command_reply(fd, "{"); + + unixctl_command_reply(fd, + json_fmt ? 
JSON_KV_32_FMT("active-conns", ",")
+ : NORM_KV_32_FMT("active-conns", "\n"),
+ active_conns);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("history-conns", ",")
+ : NORM_KV_32_FMT("history-conns", "\n"),
+ history_conns);
+
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[c2v]packets", ",")
+ : NORM_KV_32_FMT("[c2v]packets", "\n"),
+ rx_packets[0]);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[c2v]bytes", ",")
+ : NORM_KV_32_FMT("[c2v]bytes", "\n"),
+ rx_bytes[0]);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[c2v]drops", ",")
+ : NORM_KV_32_FMT("[c2v]drops", "\n"),
+ rx_drops[0]);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[r2v]packets", ",")
+ : NORM_KV_32_FMT("[r2v]packets", "\n"),
+ rx_packets[1]);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[r2v]bytes", ",")
+ : NORM_KV_32_FMT("[r2v]bytes", "\n"),
+ rx_bytes[1]);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[r2v]drops", ",")
+ : NORM_KV_32_FMT("[r2v]drops", "\n"),
+ rx_drops[1]);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[v2r]packets", ",")
+ : NORM_KV_32_FMT("[v2r]packets", "\n"),
+ tx_packets[0]);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[v2r]bytes", ",")
+ : NORM_KV_32_FMT("[v2r]bytes", "\n"),
+ tx_bytes[0]);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[v2c]packets", ",")
+ : NORM_KV_32_FMT("[v2c]packets", "\n"),
+ tx_packets[1]);
+ unixctl_command_reply(fd,
+ json_fmt ? JSON_KV_32_FMT("[v2c]bytes", "")
+ : NORM_KV_32_FMT("[v2c]bytes", "\n"),
+ tx_bytes[1]);
+
+ if (json_fmt)
+ unixctl_command_reply(fd, "}\n");
+}
+
+UNIXCTL_CMD_REGISTER("vs/stats", "VIP:VPORT tcp|udp [--json].",
+ "Show packet statistics of virtual service.", 2, 3,
+ vs_stats_cmd_cb);
+
+static int
+rs_add_arg_parse(char *argv[], __attribute((unused)) int argc, uint32_t *vip,
+ uint16_t *vport, uint8_t *proto, uint32_t *rip,
+ uint16_t *rport, int *weight) {
+ int rc;
+ int i = 0;
+
+ /* ip:port */
+ rc = parse_ipv4_port(argv[i++], vip, vport);
+ if (rc < 0) {
+ return i - 1;
+ }
+
+ /* proto */
+ rc = parse_l4_proto(argv[i++], proto);
+ if (rc < 0) {
+ return i - 1;
+ }
+
+ rc = parse_ipv4_port(argv[i++], rip, rport);
+ if (rc < 0) {
+ return i - 1;
+ }
+
+ if (i < argc) {
+ uint16_t w;
+
+ /* Parse into a uint16_t and then widen: writing through
+ * (uint16_t *)weight would leave the int's upper bytes stale. */
+ rc = parser_read_uint16(&w, argv[i++]);
+ if (rc < 0)
+ return i - 1;
+ *weight = w;
+ } else {
+ *weight = 0;
+ }
+
+ return i;
+}
+
+static void
+rs_add_cmd_cb(int fd, char *argv[], int argc) {
+ uint32_t vip;
+ uint16_t vport;
+ uint8_t proto;
+ uint32_t rip;
+ uint16_t rport;
+ int weight;
+ int rc;
+ uint32_t socket_id;
+ struct lb_virt_service *vss[RTE_MAX_NUMA_NODES] = {0};
+ struct lb_real_service *rss[RTE_MAX_NUMA_NODES] = {0};
+
+ rc = rs_add_arg_parse(argv, argc, &vip, &vport, &proto, &rip, &rport,
+ &weight);
+ if (rc != argc) {
+ unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]);
+ return;
+ }
+
+ VS_TBL_FOREACH_SOCKET(socket_id) {
+ vss[socket_id] = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto);
+ if (vss[socket_id] == NULL) {
+ unixctl_command_reply_error(fd, "Cannot find virt service.\n");
+ return;
+ }
+ }
+
+ VS_TBL_FOREACH_SOCKET(socket_id) {
+ if (vs_find_rs(vss[socket_id], rip, rport) != NULL) {
+ unixctl_command_reply_error(fd, "Real service already exists.\n");
+ return;
+ }
+ }
+
+ VS_TBL_FOREACH_SOCKET(socket_id) {
+ rss[socket_id] = lb_rs_alloc(rip, rport, weight, vss[socket_id]);
+ if (rss[socket_id] == NULL) {
+ unixctl_command_reply_error(fd, "Not enough memory.\n");
+ goto free_rss;
+ }
+ }
+
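+ /* Phase 2: publish. Insert each per-socket replica into its rs list
+ * and scheduler under the write lock; any failure unwinds every
+ * socket via del_sched/free_rss below. */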
VS_TBL_FOREACH_SOCKET(socket_id) { + LB_VS_WLOCK(vss[socket_id]); + lb_rs_list_insert_by_weight(vss[socket_id], rss[socket_id]); + rss[socket_id]->flags |= LB_RS_F_AVAILABLE; + rc = vss[socket_id]->sched->add(vss[socket_id], rss[socket_id]); + if (rc < 0) { + rss[socket_id]->flags &= ~LB_RS_F_AVAILABLE; + LIST_REMOVE(rss[socket_id], next); + LB_VS_WUNLOCK(vss[socket_id]); + unixctl_command_reply_error(fd, "Not enough memory.\n"); + goto del_sched; + } + LB_VS_WUNLOCK(vss[socket_id]); + } + + return; + +del_sched: + VS_TBL_FOREACH_SOCKET(socket_id) { + LB_VS_WLOCK(vss[socket_id]); + if (rss[socket_id]->flags & LB_RS_F_AVAILABLE) { + vss[socket_id]->sched->del(vss[socket_id], rss[socket_id]); + LIST_REMOVE(rss[socket_id], next); + } + LB_VS_WUNLOCK(vss[socket_id]); + } + +free_rss: + VS_TBL_FOREACH_SOCKET(socket_id) { lb_rs_free(rss[socket_id]); } +} + +UNIXCTL_CMD_REGISTER("rs/add", "VIP:VPORT tcp|udp RIP:RPORT [WEIGHT].", + "Add real service.", 3, 4, rs_add_cmd_cb); + +static int +rs_del_arg_parse(char *argv[], __attribute((unused)) int argc, uint32_t *vip, + uint16_t *vport, uint8_t *proto, uint32_t *rip, + uint16_t *rport) { + int rc; + int i = 0; + + /* ip:port */ + rc = parse_ipv4_port(argv[i++], vip, vport); + if (rc < 0) { + return i - 1; + } + + /* proto */ + rc = parse_l4_proto(argv[i++], proto); + if (rc < 0) { + return i - 1; + } + + rc = parse_ipv4_port(argv[i++], rip, rport); + if (rc < 0) { + return i - 1; + } + + return i; +} + +static void +rs_del_cmd_cb(int fd, char *argv[], int argc) { + uint32_t vip; + uint16_t vport; + uint8_t proto; + uint32_t rip; + uint16_t rport; + int rc; + uint32_t socket_id; + struct lb_virt_service *vss[RTE_MAX_NUMA_NODES] = {0}; + struct lb_real_service *rs; + + rc = rs_del_arg_parse(argv, argc, &vip, &vport, &proto, &rip, &rport); + if (rc != argc) { + unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]); + return; + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + vss[socket_id] = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto); + if (vss[socket_id] == NULL) { + unixctl_command_reply_error(fd, "Cannot find virt service.\n"); + return; + } + } + + VS_TBL_FOREACH_SOCKET(socket_id) { + rs = vs_find_rs(vss[socket_id], rip, rport); + if (rs == NULL) + continue; + + LB_VS_WLOCK(vss[socket_id]); + if (rs->flags & LB_RS_F_AVAILABLE) { + rs->flags &= ~LB_RS_F_AVAILABLE; + vss[socket_id]->sched->del(vss[socket_id], rs); + } + LIST_REMOVE(rs, next); + LB_VS_WUNLOCK(vss[socket_id]); + + lb_rs_free(rs); + } +} + +UNIXCTL_CMD_REGISTER("rs/del", "VIP:VPORT tcp|udp RIP:RPORT.", + "Del real service.", 3, 3, rs_del_cmd_cb); + +static int +rs_list_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport, + uint8_t *proto, int *json_fmt) { + int rc; + int i = 0; + + /* ip:port */ + rc = parse_ipv4_port(argv[i++], vip, vport); + if (rc < 0) + return i - 1; + + /* proto */ + rc = parse_l4_proto(argv[i++], proto); + if (rc < 0) + return i - 1; + + if (i < argc) { + *json_fmt = 1; + rc = strcmp(argv[i++], "--json"); + if (rc != 0) + return i - 1; + } else { + *json_fmt = 0; + } + + return i; +} + +static void +rs_list_cmd_cb(int fd, char *argv[], int argc) { + uint32_t vip; + uint16_t vport; + uint8_t proto; + int json_fmt = 0, json_first_obj = 1; + int rc; + uint32_t socket_id; + struct lb_virt_service *vs = NULL; + struct lb_real_service *rs; + char buf[32]; + + rc = rs_list_arg_parse(argv, argc, &vip, &vport, &proto, &json_fmt); + if (rc != argc) { + unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]); + return; + } + + 
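+ /* Every socket keeps an identical replica of the service tables, so
+ * the listing reads from the first populated socket and breaks out
+ * of the loop. */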
+    VS_TBL_FOREACH_SOCKET(socket_id) {
+        vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto);
+        if (vs == NULL) {
+            unixctl_command_reply_error(fd, "Cannot find virt service.\n");
+            return;
+        }
+        if (json_fmt)
+            unixctl_command_reply(fd, "[");
+        else
+            unixctl_command_reply(
+                fd, "IP              Port  Type Status Weight\n");
+        LIST_FOREACH(rs, &vs->real_services, next) {
+            ipv4_addr_tostring(rs->rip, buf, sizeof(buf));
+            if (json_fmt) {
+                unixctl_command_reply(fd, json_first_obj ? "{" : ",{");
+                json_first_obj = 0;
+                unixctl_command_reply(fd, JSON_KV_S_FMT("ip", ","), buf);
+                unixctl_command_reply(fd, JSON_KV_32_FMT("port", ","),
+                                      rte_be_to_cpu_16(rs->rport));
+                unixctl_command_reply(fd, JSON_KV_S_FMT("type", ","),
+                                      l4proto_format(rs->proto));
+                unixctl_command_reply(fd, JSON_KV_S_FMT("status", ","),
+                                      rs->flags & LB_RS_F_AVAILABLE ? "up"
+                                                                    : "down");
+                unixctl_command_reply(fd, JSON_KV_32_FMT("weight", "}"),
+                                      (uint32_t)rs->weight);
+            } else {
+                unixctl_command_reply(
+                    fd, "%-15s %-5u %-4s %-6s %-10d\n", buf,
+                    rte_be_to_cpu_16(rs->rport), l4proto_format(rs->proto),
+                    rs->flags & LB_RS_F_AVAILABLE ? "up" : "down", rs->weight);
+            }
+        }
+        if (json_fmt)
+            unixctl_command_reply(fd, "]\n");
+
+        break;
+    }
+}
+
+UNIXCTL_CMD_REGISTER("rs/list", "VIP:VPORT tcp|udp [--json].",
+                     "List all real services.", 2, 3, rs_list_cmd_cb);
+
+static int
+rs_status_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport,
+                    uint8_t *proto, uint32_t *rip, uint16_t *rport,
+                    uint8_t *echo, uint8_t *op) {
+    int rc;
+    int i = 0;
+
+    /* ip:port */
+    rc = parse_ipv4_port(argv[i++], vip, vport);
+    if (rc < 0) {
+        return i - 1;
+    }
+
+    /* proto */
+    rc = parse_l4_proto(argv[i++], proto);
+    if (rc < 0) {
+        return i - 1;
+    }
+
+    rc = parse_ipv4_port(argv[i++], rip, rport);
+    if (rc < 0) {
+        return i - 1;
+    }
+
+    if (i < argc) {
+        *echo = 0;
+        rc = parser_read_uint8(op, argv[i++]);
+        if (rc < 0)
+            return i - 1;
+    } else {
+        *echo = 1;
+    }
+
+    return i;
+}
+
+static void
+rs_status_cmd_cb(int fd, char *argv[], int argc) {
+    uint32_t vip, rip;
+    uint16_t vport, rport;
+    uint8_t proto;
+    uint8_t echo = 0;
+    uint8_t op;
+    int rc;
+    uint32_t socket_id;
+    struct lb_virt_service *vss[RTE_MAX_NUMA_NODES] = {0};
+    struct lb_virt_service *vs;
+    struct lb_real_service *rs;
+
+    rc = rs_status_arg_parse(argv, argc, &vip, &vport, &proto, &rip, &rport,
+                             &echo, &op);
+    if (rc != argc) {
+        unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]);
+        return;
+    }
+
+    VS_TBL_FOREACH_SOCKET(socket_id) {
+        vss[socket_id] = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto);
+        if (vss[socket_id] == NULL) {
+            unixctl_command_reply_error(fd, "Cannot find virt service.\n");
+            return;
+        }
+    }
+
+    VS_TBL_FOREACH_SOCKET(socket_id) {
+        vs = vss[socket_id];
+        rs = vs_find_rs(vs, rip, rport);
+        if (rs == NULL) {
+            unixctl_command_reply_error(fd, "Cannot find real service.\n");
+            return;
+        }
+
+        if (echo) {
+            unixctl_command_reply(fd, "%u\n", rs->flags & LB_RS_F_AVAILABLE);
+            return;
+        }
+
+        if (rs->flags & LB_RS_F_AVAILABLE && !op) {
+            LB_VS_WLOCK(vs);
+            rs->flags &= ~LB_RS_F_AVAILABLE;
+            vs->sched->del(vs, rs);
+            LB_VS_WUNLOCK(vs);
+        } else if (!(rs->flags & LB_RS_F_AVAILABLE) && op) {
+            LB_VS_WLOCK(vs);
+            rs->flags |= LB_RS_F_AVAILABLE;
+            if (vs->sched->add(vs, rs) < 0) {
+                rs->flags &= ~LB_RS_F_AVAILABLE;
+                LB_VS_WUNLOCK(vs);
+                goto failed;
+            }
+            LB_VS_WUNLOCK(vs);
+        }
+    }
+    return;
+
+failed:
+    /* Roll back every socket that was switched on; the real service has
+     * to be looked up again on each socket. */
+    VS_TBL_FOREACH_SOCKET(socket_id) {
+        vs = vss[socket_id];
+        rs = vs_find_rs(vs, rip, rport);
+        if (rs != NULL && (rs->flags & LB_RS_F_AVAILABLE)) {
+            LB_VS_WLOCK(vs);
+            rs->flags &= ~LB_RS_F_AVAILABLE;
+            vs->sched->del(vs, rs);
+            LB_VS_WUNLOCK(vs);
+        }
+    }
+}
+
+UNIXCTL_CMD_REGISTER("rs/status", "VIP:VPORT tcp|udp RIP:RPORT [0|1].",
+                     "Show or set the status of real services.", 3, 4,
+                     rs_status_cmd_cb);
+
+static int
+rs_weight_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport,
+                    uint8_t *proto, uint32_t *rip, uint16_t *rport,
+                    uint8_t *echo, int *weight) {
+    int rc;
+    int i = 0;
+
+    /* ip:port */
+    rc = parse_ipv4_port(argv[i++], vip, vport);
+    if (rc < 0) {
+        return i - 1;
+    }
+
+    /* proto */
+    rc = parse_l4_proto(argv[i++], proto);
+    if (rc < 0) {
+        return i - 1;
+    }
+
+    rc = parse_ipv4_port(argv[i++], rip, rport);
+    if (rc < 0) {
+        return i - 1;
+    }
+
+    if (i < argc) {
+        uint16_t w;
+
+        *echo = 0;
+        rc = parser_read_uint16(&w, argv[i++]);
+        if (rc < 0)
+            return i - 1;
+        *weight = w;
+    } else {
+        *echo = 1;
+    }
+
+    return i;
+}
+
+static void
+rs_weight_cmd_cb(int fd, char *argv[], int argc) {
+    uint32_t vip, rip;
+    uint16_t vport, rport;
+    uint8_t proto;
+    uint8_t echo = 0;
+    int weight;
+    int rc;
+    uint32_t socket_id;
+    struct lb_virt_service *vss[RTE_MAX_NUMA_NODES] = {0};
+    struct lb_real_service *rs;
+
+    rc = rs_weight_arg_parse(argv, argc, &vip, &vport, &proto, &rip, &rport,
+                             &echo, &weight);
+    if (rc != argc) {
+        unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]);
+        return;
+    }
+
+    VS_TBL_FOREACH_SOCKET(socket_id) {
+        vss[socket_id] = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto);
+        if (vss[socket_id] == NULL) {
+            unixctl_command_reply_error(fd, "Cannot find virt service.\n");
+            return;
+        }
+    }
+
+    VS_TBL_FOREACH_SOCKET(socket_id) {
+        rs = vs_find_rs(vss[socket_id], rip, rport);
+        if (rs == NULL) {
+            unixctl_command_reply_error(fd, "Cannot find real service.\n");
+            return;
+        }
+        if (echo) {
+            unixctl_command_reply(fd, "%d\n", rs->weight);
+            return;
+        }
+
+        LB_VS_WLOCK(vss[socket_id]);
+        rs->weight = weight;
+        lb_rs_list_update_by_weight(vss[socket_id], rs);
+        vss[socket_id]->sched->update(vss[socket_id], rs);
+        LB_VS_WUNLOCK(vss[socket_id]);
+    }
+}
+
+UNIXCTL_CMD_REGISTER("rs/weight", "VIP:VPORT tcp|udp RIP:RPORT [weight].",
+                     "Show or set the weight of real services.", 3, 4,
+                     rs_weight_cmd_cb);
+
+static int
+rs_stats_arg_parse(char *argv[], int argc, uint32_t *vip, uint16_t *vport,
+                   uint8_t *proto, uint32_t *rip, uint16_t *rport,
+                   int *json_fmt) {
+    int rc;
+    int i = 0;
+
+    /* ip:port */
+    rc = parse_ipv4_port(argv[i++], vip, vport);
+    if (rc < 0) {
+        return i - 1;
+    }
+
+    /* proto */
+    rc = parse_l4_proto(argv[i++], proto);
+    if (rc < 0) {
+        return i - 1;
+    }
+
+    rc = parse_ipv4_port(argv[i++], rip, rport);
+    if (rc < 0) {
+        return i - 1;
+    }
+
+    if (i < argc) {
+        *json_fmt = 1;
+        rc = strcmp(argv[i++], "--json");
+        if (rc != 0)
+            return i - 1;
+    } else {
+        *json_fmt = 0;
+    }
+
+    return i;
+}
+
+static void
+rs_stats_cmd_cb(int fd, char *argv[], int argc) {
+    uint32_t vip;
+    uint16_t vport;
+    uint8_t proto;
+    uint32_t rip;
+    uint16_t rport;
+    int json_fmt = 0;
+    int rc;
+    uint32_t socket_id;
+    struct lb_virt_service *vs;
+    struct lb_real_service *rs;
+    uint32_t lcore_id;
+    uint64_t packets[2] = {0}, bytes[2] = {0};
+    uint64_t active_conns = 0, history_conns = 0;
+
+    rc = rs_stats_arg_parse(argv, argc, &vip, &vport, &proto, &rip, &rport,
+                            &json_fmt);
+    if (rc != argc) {
+        unixctl_command_reply_error(fd, "Invalid parameter: %s.\n", argv[rc]);
+        return;
+    }
+
+    VS_TBL_FOREACH_SOCKET(socket_id) {
+        vs = vs_tbl_find(lb_vs_tbls[socket_id], vip, vport, proto);
+        if (vs == NULL) {
+            unixctl_command_reply_error(fd, "Cannot find virt service.\n");
+            return;
+        }
+
+        rs = vs_find_rs(vs, rip, rport);
+        if (rs == NULL) {
+            unixctl_command_reply_error(fd, "Cannot find real service.\n");
+            return;
+        }
+
+        RTE_LCORE_FOREACH_SLAVE(lcore_id) {
+            packets[0] += rs->stats[lcore_id].packets[0];
+            packets[1] += rs->stats[lcore_id].packets[1];
+            bytes[0] += rs->stats[lcore_id].bytes[0];
+            bytes[1] += rs->stats[lcore_id].bytes[1];
+            history_conns += rs->stats[lcore_id].conns;
+        }
+        /* The active connection counter is shared across lcores; read it
+         * once per socket, not once per worker lcore. */
+        active_conns += (uint64_t)rte_atomic32_read(&rs->active_conns);
+    }
+
+    if (json_fmt)
+        unixctl_command_reply(fd, "{");
+    unixctl_command_reply(fd,
+                          json_fmt ? JSON_KV_32_FMT("active-conns", ",")
+                                   : NORM_KV_32_FMT("active-conns", "\n"),
+                          active_conns);
+    unixctl_command_reply(fd,
+                          json_fmt ? JSON_KV_32_FMT("history-conns", ",")
+                                   : NORM_KV_32_FMT("history-conns", "\n"),
+                          history_conns);
+    unixctl_command_reply(fd,
+                          json_fmt ? JSON_KV_32_FMT("[v2r]packets", ",")
+                                   : NORM_KV_32_FMT("[v2r]packets", "\n"),
+                          packets[0]);
+    unixctl_command_reply(fd,
+                          json_fmt ? JSON_KV_32_FMT("[v2r]bytes", ",")
+                                   : NORM_KV_32_FMT("[v2r]bytes", "\n"),
+                          bytes[0]);
+    unixctl_command_reply(fd,
+                          json_fmt ? JSON_KV_32_FMT("[r2v]packets", ",")
+                                   : NORM_KV_32_FMT("[r2v]packets", "\n"),
+                          packets[1]);
+    unixctl_command_reply(fd,
+                          json_fmt ? JSON_KV_32_FMT("[r2v]bytes", "")
+                                   : NORM_KV_32_FMT("[r2v]bytes", "\n"),
+                          bytes[1]);
+    if (json_fmt)
+        unixctl_command_reply(fd, "}\n");
+}
+
+UNIXCTL_CMD_REGISTER("rs/stats", "VIP:VPORT tcp|udp RIP:RPORT [--json].",
+                     "Show the packet stats of real services.", 3, 4,
+                     rs_stats_cmd_cb);
+
diff --git a/core/lb_service.h b/core/lb_service.h
new file mode 100644
index 0000000..32ec908
--- /dev/null
+++ b/core/lb_service.h
@@ -0,0 +1,90 @@
+/* Copyright (c) 2018. TIG developer. */
+
+#ifndef __LB_SERVICE_H__
+#define __LB_SERVICE_H__
+
+#include <sys/queue.h>
+
+#include <rte_atomic.h>
+#include <rte_rwlock.h>
+
+#include "lb_proto.h"
+#include "lb_scheduler.h"
+
+#define LB_MAX_VS (1 << 16)
+
+#define LB_VS_F_SYNPROXY (0x01)
+#define LB_VS_F_TOA (0x02)
+#define LB_VS_F_CQL (0x04)
+
+#define LB_RS_F_AVAILABLE (0x1)
+
+struct lb_service_stats {
+    uint64_t packets[LB_DIR_MAX];
+    uint64_t bytes[LB_DIR_MAX];
+    uint64_t drops[LB_DIR_MAX];
+    uint64_t conns;
+};
+
+struct lb_real_service;
+
+struct lb_virt_service {
+    uint32_t vip;
+    uint16_t vport;
+    uint8_t proto;
+
+    uint32_t est_timeout;
+    int max_conns;
+    rte_atomic32_t active_conns;
+    rte_atomic32_t refcnt;
+
+    uint32_t flags;
+
+    uint32_t socket_id;
+
+    rte_rwlock_t rwlock;
+
+    const struct lb_scheduler *sched;
+    void *sched_data;
+
+    LIST_HEAD(, lb_real_service) real_services;
+
+    struct lb_service_stats stats[RTE_MAX_LCORE];
+};
+
+struct lb_real_service {
+    LIST_ENTRY(lb_real_service) next;
+    uint32_t rip;
+    uint16_t rport;
+    uint8_t proto;
+
+    uint32_t flags;
+
+    rte_atomic32_t active_conns;
+    rte_atomic32_t refcnt;
+
+    int weight;
+
+    struct lb_virt_service *virt_service;
+    void *sched_node;
+
+    struct lb_service_stats stats[RTE_MAX_LCORE];
+};
+
+int lb_is_vip_exist(uint32_t vip);
+struct lb_virt_service *lb_vs_get(uint32_t vip, uint16_t vport, uint8_t proto);
+void lb_vs_put(struct lb_virt_service *vs);
+struct lb_real_service *lb_vs_get_rs(struct lb_virt_service *vs, uint32_t cip,
+                                     uint16_t cport);
+void lb_vs_put_rs(struct lb_real_service *rs);
+void lb_vs_free(struct lb_virt_service *vs);
+void lb_rs_free(struct lb_real_service *rs);
+int lb_service_init(void);
+
+static inline int
+lb_vs_check_max_conn(struct lb_virt_service *vs) {
+    return rte_atomic32_read(&vs->active_conns) >= vs->max_conns;
+}
+
+#endif
+
diff --git 
a/core/lb_synproxy.c b/core/lb_synproxy.c new file mode 100644 index 0000000..87eadc4 --- /dev/null +++ b/core/lb_synproxy.c @@ -0,0 +1,536 @@ +/* Copyright (c) 2018. TIG developer. */ + +#include +#include + +#include "lb_conn.h" +#include "lb_md5.h" +#include "lb_proto.h" +#include "lb_service.h" +#include "lb_synproxy.h" +#include "lb_tcp_secret_seq.h" +#include "lb_toa.h" + +// #define SYNPROXY_DEBUG +#ifdef SYNPROXY_DEBUG +#define SYNPROXY_PRINT(...) \ + do { \ + fprintf(stderr, "SYNPROXY: "); \ + fprintf(stderr, __VA_ARGS__); \ + } while (0) +#else +#define SYNPROXY_PRINT(...) \ + do { \ + } while (0) +#endif + +extern uint32_t tcp_timeouts[TCP_CONNTRACK_MAX]; + +static inline void +tcp_conn_set_state(struct lb_conn *conn, uint32_t state) { + uint32_t timeout = 0; + + if (state == TCP_CONNTRACK_ESTABLISHED) { + timeout = conn->real_service->virt_service->est_timeout; + } + if (timeout == 0) + timeout = tcp_timeouts[TCP_CONNTRACK_ESTABLISHED]; + conn->state = state; + conn->timeout = timeout; +} + +static const uint16_t msstab[] = {536, 1300, 1440, 1460}; + +static uint32_t net_secret[2][MD5_MESSAGE_BYTES / 4] __rte_cache_aligned; + +static void +synproxy_net_secret_init(void) { + int i; + static uint8_t inited = 0; + + if (likely(inited)) + return; + + for (i = 0; i < MD5_MESSAGE_BYTES / 4; i++) { + net_secret[0][i] = rte_rand(); + net_secret[1][i] = rte_rand(); + } + + inited = 1; +}; + +#define COOKIEBITS 24 /* Upper bits store count */ +#define COOKIEMASK (((uint32_t)1 << COOKIEBITS) - 1) + +#define COUNTER_TRIES 4 + +static inline uint32_t +tcp_cookie_time(void) { + /* 64s */ + return (uint32_t)(rte_get_tsc_cycles() / (rte_get_tsc_hz() * 60)); +} + +static inline uint32_t +cookie_hash(uint32_t saddr, uint32_t daddr, uint16_t sport, uint16_t dport, + uint32_t count, int c) { + uint32_t hash[4]; + + synproxy_net_secret_init(); + + hash[0] = saddr; + hash[1] = daddr; + hash[2] = (sport << 16) + dport; + hash[3] = count; + + md5_transform(hash, net_secret[c]); + + return hash[0]; +} + +static uint32_t +secure_tcp_syn_cookie(uint32_t saddr, uint32_t daddr, uint16_t sport, + uint16_t dport, uint32_t sseq, uint32_t count, + uint32_t data) { + return (cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq + + (count << COOKIEBITS) + + ((cookie_hash(saddr, daddr, sport, dport, count, 1) + data) & + COOKIEMASK)); +} + +static uint32_t +check_tcp_syn_cookie(uint32_t cookie, uint32_t saddr, uint32_t daddr, + uint16_t sport, uint16_t dport, uint32_t sseq, + uint32_t count, uint32_t maxdiff) { + uint32_t diff; + + cookie -= cookie_hash(saddr, daddr, sport, dport, 0, 0) + sseq; + + diff = (count - (cookie >> COOKIEBITS)) & ((uint32_t)-1 >> COOKIEBITS); + if (diff >= maxdiff) + return (uint32_t)-1; + + return (cookie - cookie_hash(saddr, daddr, sport, dport, count - diff, 1)) & + COOKIEMASK; +} + +uint32_t +synproxy_cookie_ipv4_init_sequence(struct ipv4_hdr *iph, struct tcp_hdr *th, + struct synproxy_options *opts) { + int mssid; + const uint16_t mss = opts->mss_clamp; + uint32_t data = 0; + + for (mssid = RTE_DIM(msstab) - 1; mssid; mssid--) + if (mss >= msstab[mssid]) + break; + + data = ((mssid & 0x0f) << LB_SYNPROXY_MSS_BITS); + data |= opts->sack_ok << LB_SYNPROXY_SACKOK_BIT; + data |= opts->tstamp_ok << LB_SYNPROXY_TSOK_BIT; + data |= ((opts->snd_wscale & 0x0f) << LB_SYNPROXY_SND_WSCALE_BITS); + + return secure_tcp_syn_cookie(iph->src_addr, iph->dst_addr, th->src_port, + th->dst_port, rte_be_to_cpu_32(th->sent_seq), + tcp_cookie_time(), data); +} + +uint32_t 
+synproxy_cookie_ipv4_check(struct ipv4_hdr *iph, struct tcp_hdr *th, + struct synproxy_options *opts) { + uint32_t rc; + uint32_t cookie; + uint32_t sseq; + uint32_t mssid; + + cookie = rte_be_to_cpu_32(th->recv_ack) - 1; + sseq = rte_be_to_cpu_32(th->sent_seq) - 1; + rc = check_tcp_syn_cookie(cookie, iph->src_addr, iph->dst_addr, + th->src_port, th->dst_port, sseq, + tcp_cookie_time(), COUNTER_TRIES); + if (rc == (uint32_t)-1) + return 0; + + mssid = (rc & LB_SYNPROXY_MSS_MASK) >> LB_SYNPROXY_MSS_BITS; + + memset(opts, 0, sizeof(struct synproxy_options)); + if ((mssid < RTE_DIM(msstab)) && ((rc & LB_SYNPROXY_OTHER_MASK) == 0)) { + opts->mss_clamp = msstab[mssid]; + opts->sack_ok = + (rc & LB_SYNPROXY_SACKOK_MASK) >> LB_SYNPROXY_SACKOK_BIT; + opts->tstamp_ok = (rc & LB_SYNPROXY_TSOK_MASK) >> LB_SYNPROXY_TSOK_BIT; + opts->snd_wscale = + (rc & LB_SYNPROXY_SND_WSCALE_MASK) >> LB_SYNPROXY_SND_WSCALE_BITS; + if (opts->snd_wscale > 0 && opts->snd_wscale <= LB_SYNPROXY_WSCALE_MAX) + opts->wscale_ok = 1; + else if (opts->snd_wscale == 0) + opts->wscale_ok = 0; + else + return 0; + + return 1; + } + + return 0; +} + +static void +synproxy_parse_set_options(struct tcp_hdr *th, struct synproxy_options *opts) { + uint8_t *ptr; + int len; + uint32_t *tmp; + + memset(opts, 0, sizeof(*opts)); + opts->mss_clamp = 1460; + + ptr = (uint8_t *)(th + 1); + len = (th->data_off >> 2) - sizeof(*th); + while (len > 0) { + int opcode = *ptr++; + int opsize; + + switch (opcode) { + case TCPOPT_EOL: + return; + case TCPOPT_NOP: + len--; + continue; + default: + opsize = *ptr++; + if (opsize < 2) + return; + if (opsize > len) + return; + + switch (opcode) { + case TCPOPT_MSS: + if (opsize == TCPOLEN_MSS) { + opts->mss_clamp = rte_be_to_cpu_16(*((uint16_t *)ptr)); + if (opts->mss_clamp > 1460) + opts->mss_clamp = 1460; + } + break; + case TCPOPT_WINDOW: + if (opsize == TCPOLEN_WINDOW) { + opts->wscale_ok = 1; + opts->snd_wscale = *ptr; + if (opts->snd_wscale > 14) + opts->snd_wscale = 14; + } + break; + case TCPOPT_TIMESTAMP: + if (opsize == TCPOLEN_TIMESTAMP) { + /*opts->tstamp_ok = 1; + tmp = (uint32_t *)ptr; + *(tmp + 1) = *tmp; + *tmp = rte_cpu_to_be_32((uint32_t)rte_get_tsc_cycles());*/ + *(ptr - 2) = TCPOPT_NOP; + *(ptr - 1) = TCPOPT_NOP; + tmp = (uint32_t *)ptr; + *tmp++ = 0x01010101; + *tmp = 0x01010101; + } + break; + case TCPOPT_SACK_PERM: + if (opsize == TCPOLEN_SACK_PERM) { + // opts->sack_ok = 1; + *(ptr - 2) = TCPOPT_NOP; + *(ptr - 1) = TCPOPT_NOP; + } + break; + } + ptr += opsize - 2; + len -= opsize; + } + } +} + +static uint16_t +synproxy_options_size(const struct synproxy_options *opts) { + return TCPOLEN_MSS + (opts->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + + (opts->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + + ((opts->sack_ok && !opts->tstamp_ok) ? 
TCPOLEN_SACKPERM_ALIGNED : 0); +} + +void +synproxy_seq_adjust_client(struct tcp_hdr *th, struct synproxy *proxy) { + struct lb_conn *conn = container_of(proxy, struct lb_conn, proxy); + + if (!(conn->flags & LB_CONN_F_SYNPROXY)) + return; + th->recv_ack = + rte_cpu_to_be_32(rte_be_to_cpu_32(th->recv_ack) + proxy->oft); +} + +void +synproxy_seq_adjust_backend(struct tcp_hdr *th, struct synproxy *proxy) { + struct lb_conn *conn = container_of(proxy, struct lb_conn, proxy); + + if (!(conn->flags & LB_CONN_F_SYNPROXY)) + return; + th->sent_seq = + rte_cpu_to_be_32(rte_be_to_cpu_32(th->sent_seq) - proxy->oft); +} + +static void +synproxy_sent_client_synack(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, uint16_t port_id) { + struct synproxy_options opts; + uint32_t isn; + uint16_t pkt_len; + uint32_t tmpaddr; + uint16_t tmpport; + + synproxy_parse_set_options(th, &opts); + isn = synproxy_cookie_ipv4_init_sequence(iph, th, &opts); + + pkt_len = m->data_len; + rte_pktmbuf_reset(m); + m->pkt_len = m->data_len = pkt_len; + + iph->time_to_live = 63; + iph->type_of_service = 0; + tmpaddr = iph->src_addr; + iph->src_addr = iph->dst_addr; + iph->dst_addr = tmpaddr; + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + + tmpport = th->src_port; + th->src_port = th->dst_port; + th->dst_port = tmpport; + th->recv_ack = rte_cpu_to_be_32(rte_be_to_cpu_32(th->sent_seq) + 1); + th->sent_seq = rte_cpu_to_be_32(isn); + th->tcp_flags = TCP_SYN_FLAG | TCP_ACK_FLAG; + th->tcp_urp = 0; + th->cksum = 0; + th->cksum = rte_ipv4_udptcp_cksum(iph, th); + + lb_device_output(m, iph, port_id); +} + +int +synproxy_recv_client_syn(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, uint16_t port_id) { + struct lb_virt_service *vs = NULL; + + if (SYN(th) && !ACK(th) && !RST(th) && !FIN(th) && + (vs = lb_vs_get(iph->dst_addr, th->dst_port, iph->next_proto_id)) && + (vs->flags & LB_VS_F_SYNPROXY)) { + if (lb_vs_check_max_conn(vs)) + synproxy_sent_client_synack(m, iph, th, port_id); + lb_vs_put(vs); + return 0; + } else { + if (vs != NULL) + lb_vs_put(vs); + return 1; + } +} + +static void +synproxy_syn_build_options(uint32_t *ptr, struct synproxy_options *opts) { + *ptr++ = rte_cpu_to_be_32((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | + opts->mss_clamp); + if (opts->tstamp_ok) { + if (opts->sack_ok) + *ptr++ = rte_cpu_to_be_32( + (TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + else + *ptr++ = + rte_cpu_to_be_32((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = rte_cpu_to_be_32((uint32_t)rte_get_tsc_cycles()); /* TSVAL */ + *ptr++ = 0; /* TSECR */ + } else if (opts->sack_ok) + *ptr++ = rte_cpu_to_be_32((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + if (opts->wscale_ok) + *ptr++ = rte_cpu_to_be_32((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | + (TCPOLEN_WINDOW << 8) | (opts->snd_wscale)); +} + +static void +synproxy_sent_backend_syn(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, struct lb_conn *conn, + struct synproxy_options *opts, uint16_t port_id) { + struct tcp_hdr *nth; + uint16_t win; + uint16_t tcphdr_size; + struct rte_mbuf *mcopy; + + /* For tcp seq adjust. 
*/ + tcp_secret_seq_init(conn->lip, conn->rip, conn->lport, conn->rport, + rte_be_to_cpu_32(th->sent_seq) - 1, &conn->tseq); + win = th->rx_win; + tcphdr_size = sizeof(struct tcp_hdr) + synproxy_options_size(opts); + rte_pktmbuf_reset(m); + m->pkt_len = m->data_len = + ETHER_HDR_LEN + sizeof(struct ipv4_hdr) + tcphdr_size; + + iph->total_length = rte_cpu_to_be_16(sizeof(struct ipv4_hdr) + tcphdr_size); + iph->type_of_service = 0; + iph->time_to_live = 63; + iph->src_addr = conn->lip; + iph->dst_addr = conn->rip; + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + + nth = (struct tcp_hdr *)(iph + 1); + nth->src_port = conn->lport; + nth->dst_port = conn->rport; + nth->sent_seq = rte_cpu_to_be_32(conn->tseq.isn); + nth->recv_ack = 0; + nth->data_off = tcphdr_size << 2; + nth->tcp_flags = TCP_SYN_FLAG; + nth->rx_win = win; + nth->tcp_urp = 0; + + synproxy_syn_build_options((uint32_t *)(nth + 1), opts); + + nth->cksum = 0; + nth->cksum = rte_ipv4_udptcp_cksum(iph, nth); + + mcopy = rte_pktmbuf_clone(m, m->pool); + if (mcopy != NULL) { + /* Fix m->port for BOND dev. */ + mcopy->port = port_id; + conn->proxy.syn_mbuf = mcopy; + } + + lb_device_output(m, iph, port_id); +} + +int +synproxy_recv_client_ack(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, struct lb_conn_table *ct, + uint16_t port_id) { + struct synproxy_options opts; + struct lb_virt_service *vs = NULL; + struct lb_real_service *rs = NULL; + struct lb_conn *conn = NULL; + + if (!SYN(th) && ACK(th) && !RST(th) && !FIN(th) && + (vs = lb_vs_get(iph->dst_addr, th->dst_port, iph->next_proto_id)) && + (vs->flags & LB_VS_F_SYNPROXY)) { + if (synproxy_cookie_ipv4_check(iph, th, &opts) && + (rs = lb_vs_get_rs(vs, iph->src_addr, th->src_port)) && + (conn = lb_conn_new(ct, iph->src_addr, th->src_port, rs, 1, + port_id))) { + tcp_conn_set_state(conn, TCP_CONNTRACK_SYN_SENT); + + conn->proxy.isn = rte_be_to_cpu_32(th->recv_ack) - 1; + + synproxy_sent_backend_syn(m, iph, th, conn, &opts, port_id); + } + + lb_vs_put(vs); + if (conn == NULL && rs != NULL) + lb_vs_put_rs(rs); + return 0; + } else { + if (vs != NULL) + lb_vs_put(vs); + return 1; + } +} + +static void +synproxy_sent_ack_to_backend(struct rte_mbuf *m, struct lb_conn *conn, + uint16_t port_id) { + struct ipv4_hdr *iph; + struct tcp_hdr *th; + + iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, ETHER_HDR_LEN); + iph->src_addr = conn->lip; + iph->dst_addr = conn->rip; + + th = TCP_HDR(iph); + th->src_port = conn->lport; + th->dst_port = conn->rport; + tcp_secret_seq_adjust_client(th, &conn->tseq); + synproxy_seq_adjust_client(th, &conn->proxy); + tcp_opt_add_toa(m, iph, th, conn->cip, conn->cport); + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + th->cksum = 0; + th->cksum = rte_ipv4_udptcp_cksum(iph, th); + + lb_device_output(m, iph, port_id); +} + +static void +synproxy_fwd_synack_to_client(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, struct lb_conn *conn, + uint16_t port_id) { + iph->src_addr = conn->vip; + iph->dst_addr = conn->cip; + th->src_port = conn->vport; + th->dst_port = conn->cport; + synproxy_seq_adjust_backend(th, &conn->proxy); + tcp_secret_seq_adjust_backend(th, &conn->tseq); + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + th->cksum = 0; + th->cksum = rte_ipv4_udptcp_cksum(iph, th); + + lb_device_output(m, iph, port_id); +} + +static void +synproxy_fwd_rst_to_client(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, struct lb_conn *conn, + uint16_t port_id) { + 
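+    /*
+     * The backend rejected the proxied connection, so mirror that to the
+     * client: the RST is rewritten to come from the virt service and is
+     * seeded with the cookie ISN the client already acknowledged.
+     */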
iph->src_addr = conn->vip; + iph->dst_addr = conn->cip; + th->src_port = conn->vport; + th->dst_port = conn->cport; + th->sent_seq = rte_cpu_to_be_32(conn->proxy.isn + 1); + th->recv_ack = 0; + th->tcp_flags = TCP_RST_FLAG; + iph->hdr_checksum = 0; + iph->hdr_checksum = rte_ipv4_cksum(iph); + th->cksum = 0; + th->cksum = rte_ipv4_udptcp_cksum(iph, th); + + lb_device_output(m, iph, port_id); +} + +int +synproxy_recv_backend_synack(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, struct lb_conn *conn, + uint16_t port_id) { + if (SYN(th) && ACK(th) && !RST(th) && !FIN(th) && + (conn->flags & LB_CONN_F_SYNPROXY) && + (conn->state == TCP_CONNTRACK_SYN_SENT)) { + + conn->proxy.oft = rte_be_to_cpu_32(th->sent_seq) - conn->proxy.isn; + + rte_pktmbuf_free(conn->proxy.syn_mbuf); + conn->proxy.syn_mbuf = NULL; + + if (conn->proxy.ack_mbuf != NULL) { + tcp_conn_set_state(conn, TCP_CONNTRACK_ESTABLISHED); + + /* Free SYNACK, and send ACK to backend. */ + rte_pktmbuf_free(m); + synproxy_sent_ack_to_backend(conn->proxy.ack_mbuf, conn, port_id); + conn->proxy.ack_mbuf = NULL; + } else { + tcp_conn_set_state(conn, TCP_CONNTRACK_SYN_RECV); + + /* FWD SYNACK to client. */ + synproxy_fwd_synack_to_client(m, iph, th, conn, port_id); + } + return 0; + } else if (RST(th) && (conn->flags & LB_CONN_F_SYNPROXY) && + (conn->state == TCP_CONNTRACK_SYN_SENT)) { + tcp_conn_set_state(conn, TCP_CONNTRACK_CLOSE); + + /* FWD RST to client. */ + synproxy_fwd_rst_to_client(m, iph, th, conn, port_id); + return 0; + } + return 1; +} + diff --git a/core/lb_synproxy.h b/core/lb_synproxy.h new file mode 100644 index 0000000..5f2f251 --- /dev/null +++ b/core/lb_synproxy.h @@ -0,0 +1,70 @@ +/* Copyright (c) 2018. TIG developer. */ + +#ifndef __LB_SYNPROXY_H__ +#define __LB_SYNPROXY_H__ + +#include +#include +#include + +struct lb_conn; +struct lb_conn_table; + +/* Add MASKs for TCP OPT in "data" coded in cookie */ +/* |[21][20][19-16][15-0]| + * [21] SACK + * [20] TimeStamp + * [19-16] snd_wscale + * [15-0] MSSIND + */ +#define LB_SYNPROXY_OTHER_BITS 12 +#define LB_SYNPROXY_OTHER_MASK (((uint32_t)1 << LB_SYNPROXY_OTHER_BITS) - 1) +#define LB_SYNPROXY_MSS_BITS 12 +#define LB_SYNPROXY_MSS_MASK ((uint32_t)0xf << LB_SYNPROXY_MSS_BITS) + +#define LB_SYNPROXY_SACKOK_BIT 21 +#define LB_SYNPROXY_SACKOK_MASK ((uint32_t)1 << LB_SYNPROXY_SACKOK_BIT) + +#define LB_SYNPROXY_TSOK_BIT 20 +#define LB_SYNPROXY_TSOK_MASK ((uint32_t)1 << LB_SYNPROXY_TSOK_BIT) + +#define LB_SYNPROXY_SND_WSCALE_BITS 16 +#define LB_SYNPROXY_SND_WSCALE_MASK \ + ((uint32_t)0xf << LB_SYNPROXY_SND_WSCALE_BITS) + +#define LB_SYNPROXY_WSCALE_MAX 14 + +struct synproxy_options { + uint16_t snd_wscale : 8, /* Window scaling received from sender */ + tstamp_ok : 1, /* TIMESTAMP seen on SYN packet */ + wscale_ok : 1, /* Wscale seen on SYN packet */ + sack_ok : 1; /* SACK seen on SYN packet */ + uint16_t mss_clamp; /* Maximal mss, negotiated at connection setup */ +}; + +struct synproxy { + struct rte_mbuf *syn_mbuf; + struct rte_mbuf *ack_mbuf; + uint32_t syn_retry; + uint32_t isn; + uint32_t oft; +}; + +uint32_t synproxy_cookie_ipv4_init_sequence(struct ipv4_hdr *iph, + struct tcp_hdr *th, + struct synproxy_options *opts); +uint32_t synproxy_cookie_ipv4_check(struct ipv4_hdr *iph, struct tcp_hdr *th, + struct synproxy_options *opts); +int synproxy_recv_backend_synack(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, struct lb_conn *conn, + uint16_t port_id); +int synproxy_recv_client_ack(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, 
struct lb_conn_table *ct, + uint16_t port_id); +int synproxy_recv_client_syn(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, uint16_t port_id); +void synproxy_seq_adjust_client(struct tcp_hdr *th, struct synproxy *proxy); +void synproxy_seq_adjust_backend(struct tcp_hdr *th, struct synproxy *proxy); + +#endif + diff --git a/core/lb_tcp_secret_seq.c b/core/lb_tcp_secret_seq.c new file mode 100644 index 0000000..f77fba8 --- /dev/null +++ b/core/lb_tcp_secret_seq.c @@ -0,0 +1,47 @@ +/* Copyright (c) 2018. TIG developer. */ + +#include +#include + +#include "lb_md5.h" +#include "lb_tcp_secret_seq.h" + +#ifndef __rte_cache_aligned +#define __rte_cache_aligned __rte_aligned(RTE_CACHE_LINE_SIZE) +#endif + +static uint32_t seq_secret[MD5_MESSAGE_BYTES / 4] __rte_cache_aligned; + +static void +seq_secret_init(void) { + int i; + static uint8_t inited = 0; + + if (likely(inited)) + return; + + for (i = 0; i < MD5_MESSAGE_BYTES / 4; i++) { + seq_secret[i] = rte_rand(); + } + + inited = 1; +} + +uint32_t +tcp_secret_new_seq(uint32_t saddr, uint32_t daddr, uint16_t sport, + uint16_t dport) { + uint32_t hash[MD5_DIGEST_WORDS]; + uint64_t ns; + + seq_secret_init(); + + hash[0] = saddr; + hash[1] = daddr; + hash[2] = (sport << 16) + dport; + hash[3] = seq_secret[15]; + + md5_transform(hash, seq_secret); + ns = rte_get_tsc_cycles() / ((rte_get_tsc_hz() + NS_PER_S - 1) / NS_PER_S); + return hash[0] + (uint32_t)(ns >> 6); +} + diff --git a/core/lb_tcp_secret_seq.h b/core/lb_tcp_secret_seq.h new file mode 100644 index 0000000..c1c2365 --- /dev/null +++ b/core/lb_tcp_secret_seq.h @@ -0,0 +1,35 @@ +/* Copyright (c) 2018. TIG developer. */ + +#ifndef __LB_TCP_SECRET_SEQ_H__ +#define __LB_TCP_SECRET_SEQ_H__ + +#include +#include + +struct tcp_secret_seq { + uint32_t isn; + uint32_t oft; +}; + +uint32_t tcp_secret_new_seq(uint32_t saddr, uint32_t daddr, uint16_t sport, + uint16_t dport); + +static inline void +tcp_secret_seq_init(uint32_t saddr, uint32_t daddr, uint16_t sport, + uint16_t dport, uint32_t isn, struct tcp_secret_seq *tseq) { + tseq->isn = tcp_secret_new_seq(saddr, daddr, sport, dport); + tseq->oft = tseq->isn - isn; +} + +static inline void +tcp_secret_seq_adjust_client(struct tcp_hdr *th, struct tcp_secret_seq *tseq) { + th->sent_seq = rte_cpu_to_be_32(rte_be_to_cpu_32(th->sent_seq) + tseq->oft); +} + +static inline void +tcp_secret_seq_adjust_backend(struct tcp_hdr *th, struct tcp_secret_seq *tseq) { + th->recv_ack = rte_cpu_to_be_32(rte_be_to_cpu_32(th->recv_ack) - tseq->oft); +} + +#endif + diff --git a/core/lb_toa.c b/core/lb_toa.c new file mode 100644 index 0000000..12952db --- /dev/null +++ b/core/lb_toa.c @@ -0,0 +1,44 @@ +/* Copyright (c) 2018. TIG developer. 
*/ + +#include +#include +#include + +#include "lb_toa.h" + +#define TCPOPT_ADDR 200 +#define TCPOLEN_ADDR 8 /* |opcode|size|ip+port| = 1 + 1 + 6 */ + +struct tcp_opt_toa { + uint8_t optcode; + uint8_t optsize; + uint16_t port; + uint32_t addr; +} __attribute__((__packed__)); + +void +tcp_opt_add_toa(struct rte_mbuf *m, struct ipv4_hdr *iph, struct tcp_hdr *th, + uint32_t sip, uint16_t sport) { + struct tcp_opt_toa *toa; + uint8_t *p, *q; + + /* tcp header max length */ + if ((60 - (th->data_off >> 2)) < (int)sizeof(struct tcp_opt_toa)) + return; + p = (uint8_t *)rte_pktmbuf_append(m, sizeof(struct tcp_opt_toa)); + q = p + sizeof(struct tcp_opt_toa); + while (p >= ((uint8_t *)th + (th->data_off >> 2))) { + *q = *p; + q--; + p--; + } + toa = (struct tcp_opt_toa *)((uint8_t *)th + (th->data_off >> 2)); + toa->optcode = TCPOPT_ADDR; + toa->optsize = TCPOLEN_ADDR; + toa->port = sport; + toa->addr = sip; + th->data_off += (sizeof(struct tcp_opt_toa) / 4) << 4; + iph->total_length = rte_cpu_to_be_16(rte_be_to_cpu_16(iph->total_length) + + sizeof(struct tcp_opt_toa)); +} + diff --git a/core/lb_toa.h b/core/lb_toa.h new file mode 100644 index 0000000..4b6d532 --- /dev/null +++ b/core/lb_toa.h @@ -0,0 +1,10 @@ +/* Copyright (c) 2018. TIG developer. */ + +#ifndef __LB_TOA_H__ +#define __LB_TOA_H__ + +void tcp_opt_add_toa(struct rte_mbuf *m, struct ipv4_hdr *iph, + struct tcp_hdr *th, uint32_t sip, uint16_t sport); + +#endif + diff --git a/core/main.c b/core/main.c new file mode 100644 index 0000000..9bf9b8c --- /dev/null +++ b/core/main.c @@ -0,0 +1,439 @@ +/* Copyright (c) 2018. TIG developer. */ + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include "lb_arp.h" +#include "lb_clock.h" +#include "lb_config.h" +#include "lb_device.h" +#include "lb_format.h" +#include "lb_parser.h" +#include "lb_proto.h" +#include "lb_service.h" + +#define VERSION "0.1" + +#define RUN_ONCE_N_MS(f, n) \ + do { \ + static uint64_t last_tsc = 0; \ + uint64_t curr_tsc; \ + \ + curr_tsc = rte_rdtsc(); \ + if (curr_tsc - last_tsc >= MS_TO_CYCLES(n)) { \ + f(); \ + last_tsc = curr_tsc; \ + } \ + } while (0) + +/* unixctl command */ +static int unixctl_fd; +static struct rte_timer unixctl_timer; + +/* clock */ +rte_atomic32_t lb_clock; +static struct rte_timer lb_clock_timer; + +#define SOCK_FILEPATH "/var/run/jupiter.sock" +#define PID_FILEPATH "/var/run/jupiter.pid" +#define DEFAULT_CONF_FILEPATH "/etc/jupiter/jupiter.cfg" + +static const char *lb_cfgfile = DEFAULT_CONF_FILEPATH; +static const char *lb_procname; +static int lb_daemon = 0; + +static void +lb_clock_timer_cb(__attribute__((unused)) struct rte_timer *t, + __attribute__((unused)) void *arg) { + rte_atomic32_t *clock = arg; + + rte_atomic32_inc(clock); +} + +static int +lb_clock_timer_init(void) { + uint64_t ticks; + + rte_atomic32_init(&lb_clock); + rte_timer_init(&lb_clock_timer); + /* 10ms */ + ticks = MS_TO_CYCLES((MS_PER_S + LB_CLOCK_HZ - 1) / LB_CLOCK_HZ); + return rte_timer_reset(&lb_clock_timer, ticks, PERIODICAL, + rte_get_master_lcore(), lb_clock_timer_cb, + &lb_clock); +} + +static void +unixctl_server_timer_cb(__attribute__((unused)) struct rte_timer *t, + __attribute__((unused)) void *arg) { + unixctl_server_run_once(unixctl_fd); +} + +static int +unixctl_server_init(const char *path) { + unixctl_fd = unixctl_server_create(path); + if (unixctl_fd < 0) { + RTE_LOG(ERR, USER1, "%s(): unixctl_server_create failed, path = %s.\n", + __func__, path); + return -1; + } + 
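+    /*
+     * The control socket is serviced from the master lcore by a periodic
+     * timer (5 ms, see below) instead of a dedicated thread, so command
+     * handling shares rte_timer_manage() with the clock tick.
+     */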
+    rte_timer_init(&unixctl_timer);
+    return rte_timer_reset(&unixctl_timer, MS_TO_CYCLES(5), PERIODICAL,
+                           rte_get_master_lcore(), unixctl_server_timer_cb,
+                           NULL);
+}
+
+static void
+handle_packets(struct rte_mbuf **pkts, uint16_t n, uint16_t port_id) {
+    uint16_t i;
+    struct lb_device *dev;
+    struct rte_mbuf *m;
+    struct ether_hdr *eth;
+    struct ipv4_hdr *iph;
+    struct lb_proto *p;
+
+    dev = &lb_devices[port_id];
+    for (i = 0; i < n; i++) {
+        m = pkts[i];
+
+        eth = rte_pktmbuf_mtod_offset(m, struct ether_hdr *, 0);
+        switch (rte_be_to_cpu_16(eth->ether_type)) {
+        case ETHER_TYPE_ARP:
+            /* ARP is handed to the master lcore through the device ring. */
+            if (rte_ring_enqueue(dev->ring, m) < 0) {
+                rte_pktmbuf_free(m);
+            }
+            break;
+        case ETHER_TYPE_IPv4:
+            iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, ETHER_HDR_LEN);
+            if (iph->dst_addr == dev->ipv4) {
+                /* Traffic to the device address also goes to the master
+                 * lcore, which forwards it to the KNI interface. */
+                if (rte_ring_enqueue(dev->ring, m) < 0) {
+                    rte_pktmbuf_free(m);
+                }
+            } else {
+                p = lb_proto_get(iph->next_proto_id);
+                if (p != NULL) {
+                    p->fullnat_handle(m, iph, port_id);
+                } else {
+                    rte_pktmbuf_free(m);
+                }
+            }
+            break;
+        default:
+            rte_pktmbuf_free(m);
+        }
+    }
+}
+/*
+#include
+
+#define IPV4_HLEN(iph) (((iph)->version_ihl & IPV4_HDR_IHL_MASK) << 2)
+#define TCP_HDR(iph) (struct tcp_hdr *)((char *)(iph) + IPV4_HLEN(iph))
+
+static int
+drop_ack(struct rte_mbuf *m)
+{
+    struct ether_hdr *eth;
+    struct ipv4_hdr *iph;
+    struct tcp_hdr *th;
+
+    eth = rte_pktmbuf_mtod_offset(m, struct ether_hdr *, 0);
+    if (eth->ether_type != rte_be_to_cpu_16(0x0800))
+        return 0;
+    iph = rte_pktmbuf_mtod_offset(m, struct ipv4_hdr *, ETHER_HDR_LEN);
+    if (iph->next_proto_id != IPPROTO_TCP)
+        return 0;
+    th = TCP_HDR(iph);
+    if (th->tcp_flags & TCP_ACK_FLAG && !(th->tcp_flags & TCP_SYN_FLAG) &&
+        !(th->tcp_flags & TCP_FIN_FLAG))
+        return 1;
+    return 0;
+}*/
+
+static int
+master_loop(__attribute__((unused)) void *arg) {
+    uint32_t lcore_id;
+    uint16_t i, j, nb_ports;
+    uint16_t nb_ctx;
+    struct {
+        uint16_t port_id;
+        uint16_t txq_id;
+        struct rte_eth_dev_tx_buffer *tx_buffer;
+        struct rte_kni *kni;
+        struct rte_ring *ring;
+    } ctx[RTE_MAX_ETHPORTS];
+    struct rte_mbuf *pkts[PKT_MAX_BURST];
+    uint32_t n, nb_tx;
+    struct ether_hdr *ethh;
+
+    lcore_id = rte_lcore_id();
+    nb_ctx = 0;
+    nb_ports = rte_eth_dev_count();
+    for (i = 0; i < nb_ports; i++) {
+        if (lb_devices[i].type != LB_DEV_T_NORM &&
+            lb_devices[i].type != LB_DEV_T_MASTER) {
+            continue;
+        }
+
+        ctx[nb_ctx].port_id = i;
+        ctx[nb_ctx].txq_id = lb_devices[i].lcore_conf[lcore_id].txq_id;
+        ctx[nb_ctx].tx_buffer = lb_devices[i].tx_buffer[lcore_id];
+        ctx[nb_ctx].kni = lb_devices[i].kni;
+        ctx[nb_ctx].ring = lb_devices[i].ring;
+
+        nb_ctx++;
+    }
+
+    if (nb_ctx == 0) {
+        RTE_LOG(INFO, USER1, "%s(): master thread exit early.\n", __func__);
+        return 0;
+    }
+
+    RTE_LOG(INFO, USER1, "%s(): master thread started.\n", __func__);
+
+    while (1) {
+        for (i = 0; i < nb_ctx; i++) {
+            rte_kni_handle_request(ctx[i].kni);
+
+            n = rte_kni_rx_burst(ctx[i].kni, pkts, PKT_MAX_BURST);
+
+            for (j = 0; j < n; j++) {
+                /* Buffer KNI egress on the matching DPDK port. */
+                rte_eth_tx_buffer(ctx[i].port_id, ctx[i].txq_id,
+                                  ctx[i].tx_buffer, pkts[j]);
+            }
+
+            rte_eth_tx_buffer_flush(ctx[i].port_id, ctx[i].txq_id,
+                                    ctx[i].tx_buffer);
+
+            n = rte_ring_dequeue_burst(ctx[i].ring, (void **)pkts,
+                                       PKT_MAX_BURST, NULL);
+            for (j = 0; j < n; j++) {
+                ethh = rte_pktmbuf_mtod_offset(pkts[j], struct ether_hdr *, 0);
+                if (ethh->ether_type == rte_be_to_cpu_16(ETHER_TYPE_ARP)) {
+                    lb_arp_input(pkts[j], ctx[i].port_id);
+                }
+            }
+            nb_tx = rte_kni_tx_burst(ctx[i].kni, pkts, n);
+            for (j = nb_tx; j < n; j++) {
+                rte_pktmbuf_free(pkts[j]);
+            }
+        }
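+        /*
+         * Drive the timer subsystem at most once per millisecond; this
+         * fires the lb_clock tick and the unixctl poll timer registered
+         * above.
+         */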
+        RUN_ONCE_N_MS(rte_timer_manage, 1);
+    }
+
+    return 0;
+}
+
+static int
+worker_loop(__attribute__((unused)) void *arg) {
+    uint32_t lcore_id;
+    uint16_t i, nb_ports;
+    uint16_t nb_ctx;
+    struct {
+        uint16_t port_id;
+        uint16_t rxq_id, txq_id;
+        struct rte_eth_dev_tx_buffer *tx_buffer;
+        struct rte_mbuf *rx_pkts[PKT_MAX_BURST];
+        uint32_t n;
+    } ctx[RTE_MAX_ETHPORTS];
+
+    lcore_id = rte_lcore_id();
+    nb_ctx = 0;
+    nb_ports = rte_eth_dev_count();
+    for (i = 0; i < nb_ports; i++) {
+        if (lb_devices[i].type != LB_DEV_T_NORM &&
+            lb_devices[i].type != LB_DEV_T_MASTER) {
+            continue;
+        }
+
+        ctx[nb_ctx].port_id = i;
+        ctx[nb_ctx].rxq_id = lb_devices[i].lcore_conf[lcore_id].rxq_id;
+        ctx[nb_ctx].txq_id = lb_devices[i].lcore_conf[lcore_id].txq_id;
+        ctx[nb_ctx].tx_buffer = lb_devices[i].tx_buffer[lcore_id];
+
+        nb_ctx++;
+    }
+
+    if (nb_ctx == 0) {
+        RTE_LOG(INFO, USER1, "%s(): worker%u thread exit early.\n", __func__,
+                lcore_id);
+        return 0;
+    }
+
+    RTE_LOG(INFO, USER1, "%s(): worker%u thread started.\n", __func__,
+            lcore_id);
+
+    while (1) {
+        for (i = 0; i < nb_ctx; i++) {
+            rte_eth_tx_buffer_flush(ctx[i].port_id, ctx[i].txq_id,
+                                    ctx[i].tx_buffer);
+        }
+
+        for (i = 0; i < nb_ctx; i++) {
+            ctx[i].n = rte_eth_rx_burst(ctx[i].port_id, ctx[i].rxq_id,
+                                        ctx[i].rx_pkts, PKT_MAX_BURST);
+        }
+
+        for (i = 0; i < nb_ctx; i++) {
+            handle_packets(ctx[i].rx_pkts, ctx[i].n, ctx[i].port_id);
+        }
+
+        RUN_ONCE_N_MS(rte_timer_manage, 1);
+    }
+
+    return 0;
+}
+
+static int
+main_loop(void *arg) {
+    if (rte_get_master_lcore() == rte_lcore_id()) {
+        return master_loop(arg);
+    } else {
+        return worker_loop(arg);
+    }
+}
+
+static void
+usage(const char *progname) {
+    printf("usage: %s [--conf=%s] [--daemon] [--version] [--help]\n", progname,
+           DEFAULT_CONF_FILEPATH);
+    exit(0);
+}
+
+static void
+parse_args(int argc, char **argv) {
+    int i;
+
+    if ((lb_procname = strrchr(argv[0], '/')) != NULL)
+        lb_procname = strdup(lb_procname + 1);
+    else
+        lb_procname = strdup(argv[0]);
+
+    for (i = 1; i < argc; i++) {
+        if (strncmp(argv[i], "--conf=", 7) == 0) {
+            lb_cfgfile = strdup(argv[i] + 7);
+        } else if (strncmp(argv[i], "--daemon", 8) == 0) {
+            lb_daemon = 1;
+        } else if (strcmp(argv[i], "--version") == 0) {
+            printf("Version: %s\n", VERSION);
+            exit(0);
+        } else if (strcmp(argv[i], "--help") == 0) {
+            usage(lb_procname);
+        } else {
+            printf("Unknown option: %s\n", argv[i]);
+            usage(lb_procname);
+        }
+    }
+}
+
+static void
+create_daemon(void) {
+    if (daemon(0, 0) < 0) {
+        printf("%s(): Daemon failed.\n", __func__);
+        exit(-1);
+    }
+}
+
+static void
+proc_check_running(void) {
+    int fd;
+    pid_t pid;
+    char buf[32];
+
+    fd = open(PID_FILEPATH, O_RDWR | O_CREAT, 0644);
+    if (fd < 0) {
+        printf("cannot open %s\n", PID_FILEPATH);
+        exit(-1);
+    }
+    if (flock(fd, LOCK_EX | LOCK_NB) < 0) {
+        printf("%s is running.\n", lb_procname);
+        exit(-1);
+    }
+    ftruncate(fd, 0);
+    lseek(fd, 0, SEEK_SET);
+
+    pid = getpid();
+    snprintf(buf, sizeof(buf), "%d", pid);
+    if (write(fd, buf, strlen(buf)) < 0) {
+        printf("write pid to %s failed.\n", PID_FILEPATH);
+        exit(-1);
+    }
+}
+
+int
+main(int argc, char **argv) {
+    int rc;
+
+    parse_args(argc, argv);
+    if (lb_daemon)
+        create_daemon();
+
+    proc_check_running();
+
+    rc = lb_config_file_load(lb_cfgfile);
+    if (rc < 0) {
+        printf("%s(): Load config file %s failed.\n", __func__, lb_cfgfile);
+        return rc;
+    }
+
+    rc = rte_eal_init(lb_cfg->dpdk.argc, lb_cfg->dpdk.argv);
+    if (rc < 0) {
+        RTE_LOG(ERR, USER1, "%s(): rte_eal_init failed.\n", __func__);
+        return rc;
+    }
+
+    rte_timer_subsystem_init();
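+    /*
+     * Initialize the DPDK packet-capture framework so an external pdump
+     * tool can attach to the ports at runtime.
+     */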
rte_pdump_init(NULL); + + rc = lb_device_init(lb_cfg->devices, lb_cfg->nb_decices); + if (rc < 0) { + RTE_LOG(ERR, USER1, "%s(): lb_device_init failed.\n", __func__); + return rc; + } + + rc = lb_arp_init(); + if (rc < 0) { + RTE_LOG(ERR, USER1, "%s(): lb_arp_init failed.\n", __func__); + return rc; + } + + rc = lb_clock_timer_init(); + if (rc < 0) { + RTE_LOG(ERR, USER1, "%s(): lb_clock_timer_init failed.\n", __func__); + return rc; + } + + rc = unixctl_server_init(SOCK_FILEPATH); + if (rc < 0) { + RTE_LOG(ERR, USER1, "%s(): unixctl_server_init failed.\n", __func__); + return rc; + } + + rc = lb_service_init(); + if (rc < 0) { + RTE_LOG(ERR, USER1, "%s(): lb_service_init failed.\n", __func__); + return rc; + } + + rc = lb_proto_init(); + if (rc < 0) { + RTE_LOG(ERR, USER1, "%s(): lb_proto_init failed.\n", __func__); + return rc; + } + + rc = rte_eal_mp_remote_launch(main_loop, NULL, CALL_MASTER); + if (rc < 0) { + RTE_LOG(ERR, USER1, "%s(): Launch remote thread failed.\n", __func__); + return rc; + } + + return 0; +} + diff --git a/jupiter.cfg b/jupiter.cfg new file mode 100644 index 0000000..3e6654e --- /dev/null +++ b/jupiter.cfg @@ -0,0 +1,30 @@ +[DPDK] +argv = -c 0xf00 -n 4 + +[DEVICE0] +name = jupiter0 +ipv4 = 192.168.1.1 +netmask = 255.255.0.0 +gw = 192.168.1.254 +rxqsize = 512 +txqsize = 512 +mtu = 1500 +rxoffload = 0 +txoffload = 0 +local-ipv4 = 192.168.2.10/28 +pci = 00:00.0 + +; more devices: + +; [DEVICE1] +; name = jupiter1 +; ipv4 = 192.168.1.1 +; netmask = 255.255.0.0 +; gw = 192.168.1.254 +; rxqsize = +; txqsize = +; mtu = 1500 +; rxoffload = 0 +; txoffload = 0 +; local-ipv4 = 192.168.2.10/8 +; pci = 00:00.0 \ No newline at end of file diff --git a/lib/Makefile b/lib/Makefile new file mode 100644 index 0000000..bd09ab3 --- /dev/null +++ b/lib/Makefile @@ -0,0 +1,24 @@ +# Copyright (c) 2018. TIG developer. + +ifdef V +Q = +else +Q = @ +endif + +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +# Default target, can be overriden by command line or environment +RTE_TARGET ?= x86_64-native-linuxapp-gcc + +.PHONY: all +all: + $(Q)cd libcmd && $(MAKE) O=$(RTE_TARGET) + $(Q)cd libconhash && $(MAKE) O=$(RTE_TARGET) + +.PHONY: clean +clean: + $(Q)cd libcmd && $(MAKE) O=$(RTE_TARGET) clean + $(Q)cd libconhash && $(MAKE) O=$(RTE_TARGET) clean \ No newline at end of file diff --git a/lib/libcmd/Makefile b/lib/libcmd/Makefile new file mode 100644 index 0000000..67b8c3a --- /dev/null +++ b/lib/libcmd/Makefile @@ -0,0 +1,18 @@ +# Copyright (c) 2018. TIG developer. + +ifeq ($(RTE_SDK),) +$(error "Please define RTE_SDK environment variable") +endif + +include $(RTE_SDK)/mk/rte.vars.mk + +# binary name +LIB = libcmd.a + +# all source are stored in SRCS-y +SRCS-y := unixctl_command.c +SYMLINK-y-include += unixctl_command.h + +CFLAGS += $(WERROR_FLAGS) -g -O3 + +include $(RTE_SDK)/mk/rte.extlib.mk diff --git a/lib/libcmd/unixctl_command.c b/lib/libcmd/unixctl_command.c new file mode 100644 index 0000000..822a71d --- /dev/null +++ b/lib/libcmd/unixctl_command.c @@ -0,0 +1,305 @@ +/* Copyright (c) 2018. TIG developer. */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "unixctl_command.h" + +#define FALSE 0 +#define TRUE 1 + +#define MIN(a, b) ((a) < (b) ? (a) : (b)) + +#ifndef offsetof +/** Return the offset of a field in a structure. 
*/ +#define offsetof(TYPE, MEMBER) __builtin_offsetof(TYPE, MEMBER) +#endif + +#define CMDMSG_HDR_SIZE offsetof(struct unixctl_cmd_message, data) +#define CMDMSG_DATA_SIZE 512 + +#define CMDMSG_T_REQUEST 0x0 +#define CMDMSG_T_REPLY 0x1 + +#define CMDMSG_S_SUCCESS 0x0 +#define CMDMSG_S_FAIL 0x1 + +struct unixctl_cmd_message { + /* msg header */ + uint8_t type; + uint8_t status; + uint16_t data_size; + /* msg body */ + char data[CMDMSG_DATA_SIZE]; +} __attribute((packed)); + +#define CMD_MAX_OPTIONS 32 + +struct unixctl_cmd_head unixctl_cmd_entries = + TAILQ_HEAD_INITIALIZER(unixctl_cmd_entries); + +static int +read_cmd_message(int fd, struct unixctl_cmd_message *cmdmsg) { + struct iovec iov; + struct msghdr msgh = {0}; + int ret; + + memset(cmdmsg, 0, sizeof(*cmdmsg)); + iov.iov_base = cmdmsg; + iov.iov_len = CMDMSG_HDR_SIZE; + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + ret = recvmsg(fd, &msgh, 0); + if (ret <= 0) + return ret; + if (msgh.msg_flags & MSG_TRUNC) + return -1; + if (cmdmsg->data_size > 0) { + ret = read(fd, cmdmsg->data, cmdmsg->data_size); + if (ret != (int)cmdmsg->data_size) + return -1; + } + return ret; +} + +static int +write_cmd_message(int fd, struct unixctl_cmd_message *cmdmsg) { + struct iovec iov = {0}; + struct msghdr msgh = {0}; + int ret; + + iov.iov_base = cmdmsg; + iov.iov_len = CMDMSG_HDR_SIZE + cmdmsg->data_size; + msgh.msg_iov = &iov; + msgh.msg_iovlen = 1; + do { + ret = sendmsg(fd, &msgh, MSG_NOSIGNAL); + } while (ret < 0 && errno == EINTR); + return ret; +} + +static int +__unixctl_command_reply(int fd, int err, const char *buf, size_t buf_len) { + struct unixctl_cmd_message cmdmsg; + size_t wlen, offset = 0; + + while ((wlen = MIN(buf_len, CMDMSG_DATA_SIZE)) > 0) { + memset(&cmdmsg, 0, sizeof(cmdmsg)); + cmdmsg.type = CMDMSG_T_REPLY; + cmdmsg.status = err ? CMDMSG_S_FAIL : CMDMSG_S_SUCCESS; + cmdmsg.data_size = wlen; + strncpy(cmdmsg.data, buf + offset, wlen); + if (write_cmd_message(fd, &cmdmsg) < 0) + return -1; + buf_len -= wlen; + offset += wlen; + } + return 0; +} + +int +unixctl_command_reply(int fd, const char *format, ...) { + char buf[CMDMSG_DATA_SIZE]; + va_list ap; + int buf_len; + + va_start(ap, format); + buf_len = vsnprintf(buf, CMDMSG_DATA_SIZE, format, ap); + va_end(ap); + if (buf_len < 0) + return -1; + return __unixctl_command_reply(fd, FALSE, buf, buf_len); +} + +int +unixctl_command_reply_error(int fd, const char *format, ...) 
{
+    char buf[CMDMSG_DATA_SIZE];
+    va_list ap;
+    int buf_len;
+
+    va_start(ap, format);
+    buf_len = vsnprintf(buf, CMDMSG_DATA_SIZE, format, ap);
+    va_end(ap);
+    if (buf_len < 0)
+        return -1;
+    return __unixctl_command_reply(fd, TRUE, buf, buf_len);
+}
+
+static void
+unixctl_list_command_cb(int fd, __attribute__((unused)) char *argv[],
+                        __attribute__((unused)) int argc) {
+    struct unixctl_cmd_entry *entry;
+
+    unixctl_command_reply(fd, "All Commands:\n");
+    TAILQ_FOREACH(entry, &unixctl_cmd_entries, next) {
+        unixctl_command_reply(fd, " %-25s %-45s %s\n", entry->name,
+                              entry->usage, entry->summary);
+    }
+}
+
+static struct unixctl_cmd_entry *
+unixctl_cmd_lookup_by_name(const char *name) {
+    struct unixctl_cmd_entry *entry;
+
+    TAILQ_FOREACH(entry, &unixctl_cmd_entries, next) {
+        if (strcmp(entry->name, name) == 0)
+            return entry;
+    }
+    return NULL;
+}
+
+#define PARSE_DELIMITER " \f\n\r\t\v"
+static int
+parse_tokenize_string(char *string, char *tokens[], uint32_t *n_tokens) {
+    uint32_t i;
+
+    if ((string == NULL) || (tokens == NULL) || (*n_tokens < 1))
+        return -EINVAL;
+
+    for (i = 0; i < *n_tokens; i++) {
+        tokens[i] = strtok_r(string, PARSE_DELIMITER, &string);
+        if (tokens[i] == NULL)
+            break;
+    }
+
+    if ((i == *n_tokens) &&
+        (NULL != strtok_r(string, PARSE_DELIMITER, &string)))
+        return -E2BIG;
+
+    *n_tokens = i;
+    return 0;
+}
+
+void
+unixctl_server_run_once(int unixctl_server_fd) {
+    struct unixctl_cmd_message cmdmsg;
+    int cfd;
+    char *tokens[CMD_MAX_OPTIONS];
+    uint32_t n_tokens = CMD_MAX_OPTIONS;
+    struct unixctl_cmd_entry *entry;
+
+    cfd = accept(unixctl_server_fd, NULL, NULL);
+    if (cfd < 0)
+        return;
+    if (read_cmd_message(cfd, &cmdmsg) <= 0)
+        goto end;
+    if (parse_tokenize_string(cmdmsg.data, tokens, &n_tokens) < 0) {
+        unixctl_command_reply_error(cfd,
+                                    "Unixctl_cmd: Cannot process command with "
+                                    "more than %u parameters.\n",
+                                    n_tokens);
+        goto end;
+    }
+    if (n_tokens == 0) {
+        unixctl_list_command_cb(cfd, NULL, 0);
+        goto end;
+    }
+    entry = unixctl_cmd_lookup_by_name(tokens[0]);
+    if (!entry) {
+        unixctl_command_reply_error(cfd, "Unixctl_cmd: Unknown command %s.\n",
+                                    tokens[0]);
+        goto end;
+    }
+    if (entry->min_argc > n_tokens - 1) {
+        unixctl_command_reply_error(
+            cfd, "Unixctl_cmd: Too few parameters for command %s.\n",
+            entry->name);
+        goto end;
+    }
+    if (entry->max_argc < n_tokens - 1) {
+        unixctl_command_reply_error(
+            cfd, "Unixctl_cmd: Too many parameters for command %s.\n",
+            entry->name);
+        goto end;
+    }
+    if (entry->cb)
+        entry->cb(cfd, tokens + 1, n_tokens - 1);
+end:
+    close(cfd);
+}
+
+int
+unixctl_server_create(const char *path) {
+    struct sockaddr_un un;
+    int fd;
+
+    if (!path)
+        return -1;
+    fd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (fd < 0)
+        return -1;
+    if (fcntl(fd, F_SETFL, O_NONBLOCK) < 0)
+        return -1;
+    memset(&un, 0, sizeof(un));
+    un.sun_family = AF_UNIX;
+    strcpy(un.sun_path, path);
+    unlink(path);
+    if (bind(fd, (struct sockaddr *)&un, sizeof(un)) < 0)
+        return -1;
+    if (listen(fd, 10) < 0)
+        return -1;
+
+    return fd;
+}
+
+void
+unixctl_server_destory(int fd, const char *path) {
+    close(fd);
+    unlink(path);
+}
+
+int
+unixctl_client_create(const char *path) {
+    struct sockaddr_un un;
+    int fd;
+
+    fd = socket(AF_UNIX, SOCK_STREAM, 0);
+    if (fd < 0) {
+        return -1;
+    }
+    memset(&un, 0, sizeof(un));
+    un.sun_family = AF_UNIX;
+    strcpy(un.sun_path, path);
+    if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
+        close(fd);
+        return -1;
+    }
+    return fd;
+}
+
+void
+unixctl_client_destory(int fd, __attribute__((unused)) const char 
*path) { + close(fd); +} + +int +unixctl_client_request(int fd, const char *cmdline) { + struct unixctl_cmd_message cmdmsg = {0}; + + cmdmsg.data_size = strlen(cmdline); + strncpy(cmdmsg.data, cmdline, cmdmsg.data_size); + if (write_cmd_message(fd, &cmdmsg) < 0) { + return -1; + } + + while (read_cmd_message(fd, &cmdmsg) > 0) { + if (cmdmsg.status == CMDMSG_S_FAIL) { + fprintf(stderr, "%s", cmdmsg.data); + return 1; + } + fprintf(stdout, "%s", cmdmsg.data); + fflush(stdout); + } + return 0; +} + diff --git a/lib/libcmd/unixctl_command.h b/lib/libcmd/unixctl_command.h new file mode 100644 index 0000000..410c247 --- /dev/null +++ b/lib/libcmd/unixctl_command.h @@ -0,0 +1,50 @@ +/* Copyright (c) 2018. TIG developer. */ + +#ifndef __UNIXCTL_COMMAND_H__ +#define __UNIXCTL_COMMAND_H__ + +#include + +#include + +typedef void unixctl_command_cb_func(int, char **, int); + +struct unixctl_cmd_entry { + TAILQ_ENTRY(unixctl_cmd_entry) next; + const char *name; + const char *usage; + const char *summary; + uint16_t min_argc, max_argc; + unixctl_command_cb_func *cb; +}; + +TAILQ_HEAD(unixctl_cmd_head, unixctl_cmd_entry); + +extern struct unixctl_cmd_head unixctl_cmd_entries; + +#define UNIXCTL_CMD_REGISTER(n, u, s, min, max, f) \ + struct unixctl_cmd_entry cmd_##f = { \ + .name = n, \ + .usage = u, \ + .summary = s, \ + .min_argc = min, \ + .max_argc = max, \ + .cb = f, \ + }; \ + __attribute__((constructor)) static void unixctl_cmd_register_##f(void) { \ + TAILQ_INSERT_TAIL(&unixctl_cmd_entries, &cmd_##f, next); \ + } + +int unixctl_command_reply(int fd, const char *format, ...); +int unixctl_command_reply_error(int fd, const char *format, ...); + +int unixctl_server_create(const char *path); +void unixctl_server_destory(int fd, const char *path); +void unixctl_server_run_once(int unixctl_server_fd); + +int unixctl_client_create(const char *path); +void unixctl_client_destory(int fd, const char *path); +int unixctl_client_request(int fd, const char *cmdline); + +#endif + diff --git a/lib/libconhash/LICENSE b/lib/libconhash/LICENSE new file mode 100644 index 0000000..a11bdc0 --- /dev/null +++ b/lib/libconhash/LICENSE @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2010. sparkling.liang@hotmail.com. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */
diff --git a/lib/libconhash/Makefile b/lib/libconhash/Makefile
new file mode 100644
index 0000000..2982629
--- /dev/null
+++ b/lib/libconhash/Makefile
@@ -0,0 +1,18 @@
+# Copyright (c) 2018. TIG developer.
+
+ifeq ($(RTE_SDK),)
+$(error "Please define RTE_SDK environment variable")
+endif
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+# binary name
+LIB = libconhash.a
+
+# all source are stored in SRCS-y
+SRCS-y := conhash.c conhash_inter.c conhash_util.c md5.c util_rbtree.c
+SYMLINK-y-include += conhash.h configure.h
+
+CFLAGS += $(WERROR_FLAGS) -g -O3
+
+include $(RTE_SDK)/mk/rte.extlib.mk
diff --git a/lib/libconhash/README b/lib/libconhash/README
new file mode 100644
index 0000000..590cc5d
--- /dev/null
+++ b/lib/libconhash/README
@@ -0,0 +1,22 @@
+
+What is libconhash?
+libconhash is a consistent hashing library, which can be compiled on both Windows and Linux platforms, with the following features:
+
+1. High performance and easy to use, libconhash uses a red-black tree to manage all nodes to achieve high performance.
+2. By default it uses the MD5 algorithm, but it also supports user-defined hash functions.
+3. Easy to scale according to a node's processing capacity.
+
+--------------------------------------------------------------------------------------
+To build libconhash
+on Linux using
+    make
+to build a debug version, using
+    make CFLAG=DEBUG
+on Windows there are win32 projects, just build them
+
+--------------------------------------------------------------------------------------
+To use libconhash
+Include headers libconhash.h and configure.h, and link the conhash binary library file.
+There is a sample in the project that shows how to use the library.
+
+
diff --git a/lib/libconhash/configure.h b/lib/libconhash/configure.h
new file mode 100644
index 0000000..6835a83
--- /dev/null
+++ b/lib/libconhash/configure.h
@@ -0,0 +1,12 @@
+
+/* Copyright (C) 2010. sparkling.liang@hotmail.com. All rights reserved. */
+
+#ifndef __CONFIGURE_H_
+#define __CONFIGURE_H_
+
+
+typedef unsigned int u_int;
+typedef unsigned char u_char;
+typedef long util_long;
+
+#endif /* end __CONFIGURE_H_ */
diff --git a/lib/libconhash/conhash.c b/lib/libconhash/conhash.c
new file mode 100644
index 0000000..a28b598
--- /dev/null
+++ b/lib/libconhash/conhash.c
@@ -0,0 +1,111 @@
+
+/* Copyright (C) 2010. sparkling.liang@hotmail.com. All rights reserved. 
*/ + +#include "conhash.h" +#include "conhash_inter.h" + +struct conhash_s* conhash_init(conhash_cb_hashfunc pfhash) +{ + /* alloc memory and set to zero */ + struct conhash_s *conhash = (struct conhash_s*)calloc(1, sizeof(struct conhash_s)); + if(conhash == NULL) + { + return NULL; + } + do + { + /* setup callback functions */ + if(pfhash != NULL) + { + conhash->cb_hashfunc = pfhash; + } + else + { + conhash->cb_hashfunc = __conhash_hash_def; + } + util_rbtree_init(&conhash->vnode_tree); + return conhash; + + }while(0); + + free(conhash); + return NULL; +} + +void conhash_fini(struct conhash_s *conhash) +{ + if(conhash != NULL) + { + /* free rb tree */ + while(!util_rbtree_isempty(&(conhash->vnode_tree))) + { + util_rbtree_node_t *rbnode = conhash->vnode_tree.root; + util_rbtree_delete(&(conhash->vnode_tree), rbnode); + __conhash_del_rbnode(rbnode); + } + free(conhash); + } +} + +void conhash_set_node(struct node_s *node, const char *iden, u_int replica, void *userdata) +{ + strncpy(node->iden, iden, sizeof(node->iden)-1); + node->replicas = replica; + node->flag = NODE_FLAG_INIT; + node->userdata = userdata; +} + +int conhash_add_node(struct conhash_s *conhash, struct node_s *node) +{ + if((conhash==NULL) || (node==NULL)) + { + return -1; + } + /* check node fisrt */ + if(!(node->flag&NODE_FLAG_INIT) || (node->flag&NODE_FLAG_IN)) + { + return -1; + } + node->flag |= NODE_FLAG_IN; + /* add replicas of server */ + __conhash_add_replicas(conhash, node); + + return 0; +} + +int conhash_del_node(struct conhash_s *conhash, struct node_s *node) +{ + if((conhash==NULL) || (node==NULL)) + { + return -1; + } + /* check node first */ + if(!(node->flag&NODE_FLAG_INIT) || !(node->flag&NODE_FLAG_IN)) + { + return -1; + } + node->flag &= (~NODE_FLAG_IN); + /* add replicas of server */ + __conhash_del_replicas(conhash, node); + + return 0; +} + +struct node_s* conhash_lookup(struct conhash_s *conhash, const char *object, u_int length) +{ + long hash; + util_rbtree_node_t *rbnode; + if((conhash==NULL) || (conhash->ivnodes==0) || object == NULL) + { + return NULL; + } + /* calc hash value */ + hash = conhash->cb_hashfunc(object, length); + rbnode = util_rbtree_lookup(&(conhash->vnode_tree), hash); + if(rbnode != NULL) + { + struct virtual_node_s *vnode = rbnode->data; + return vnode->node; + } + return NULL; +} diff --git a/lib/libconhash/conhash.h b/lib/libconhash/conhash.h new file mode 100644 index 0000000..8dfd94e --- /dev/null +++ b/lib/libconhash/conhash.h @@ -0,0 +1,98 @@ + +/* Copyright (C) 2010. sparkling.liang@hotmail.com. All rights reserved. 
*/ + +#ifndef __CON_HASH_H_ +#define __CON_HASH_H_ + +#include "configure.h" +#include +#include + +#ifdef CONHASH_EXPORTS + +/* windows platform DLL */ +#if (defined (WIN32) || defined (__WIN32)) && (defined _USRDLL) +#define CONHASH_API __declspec(dllexport) +#else +#define CONHASH_API __declspec(dllimport) +#endif + +#else /* Linux, or static lib */ +#define CONHASH_API +#endif + +#define NODE_FLAG_INIT 0x01 /* node is initialized */ +#define NODE_FLAG_IN 0x02 /* node is added in the server */ + +/* nodes structure */ +struct node_s +{ + char iden[64]; /* node name or some thing identifies the node */ + u_int replicas; /* number of replica virtual nodes */ + u_int flag; + void *userdata; +}; + +/* + * callback function to calculate hash value + * @instr: input string + */ +typedef long (*conhash_cb_hashfunc)(const char *, u_int); + +struct conhash_s; + +/* export interfaces */ +#ifdef __cplusplus +extern "C" { +#endif + /* initialize conhash library + * @pfhash : hash function, NULL to use default MD5 method + * return a conhash_s instance + */ + CONHASH_API struct conhash_s* conhash_init(conhash_cb_hashfunc pfhash); + + /* finalize lib */ + CONHASH_API void conhash_fini(struct conhash_s *conhash); + + /* set node */ + CONHASH_API void conhash_set_node(struct node_s *node, const char *iden, u_int replica, void *userdata); + + /* + * add a new node + * @node: the node to add + */ + CONHASH_API int conhash_add_node(struct conhash_s *conhash, struct node_s *node); + + /* remove a node */ + CONHASH_API int conhash_del_node(struct conhash_s *conhash, struct node_s *node); + + /* + * update a node's virtual nodes + * @replica: new replica of server + * return 0 success, -1 failed + */ + CONHASH_API int conhash_update_node(struct conhash_s *conhash, struct node_s *node, u_int replica); + + /* + * lookup a server which object belongs to + * @object: the input string which indicates an object + * return the server_s structure, do not modify the value, or it will cause a disaster + */ + CONHASH_API struct node_s* conhash_lookup(struct conhash_s *conhash, const char *object, u_int length); + + /* some utility functions export*/ + CONHASH_API void conhash_md5_digest(const u_char *instr, u_int length, u_char digest[16]); + /* get virtual node number in the hash */ + CONHASH_API u_int conhash_get_vnodes_num(const struct conhash_s *conhash); + /* + * get virtual nodes in ascending oder + * @values, pointer to an array, stores all the nodes's hash value + * @size, how many nodes to get, can't be less than the array size + */ + CONHASH_API void conhash_get_vnodes(struct conhash_s *conhash, long *values, int size); + +#ifdef __cplusplus +} +#endif + +#endif /* end __CON_HASH_H_ */ diff --git a/lib/libconhash/conhash_inter.c b/lib/libconhash/conhash_inter.c new file mode 100644 index 0000000..b634bcd --- /dev/null +++ b/lib/libconhash/conhash_inter.c @@ -0,0 +1,118 @@ + +/* Copyright (C) 2010. sparkling.liang@hotmail.com. All rights reserved. 
+ */
+
+#include "conhash_inter.h"
+#include "conhash.h"
+
+/*
+ * the default hash function, using the md5 algorithm
+ * @instr: input string
+ */
+long __conhash_hash_def(const char *instr, u_int length)
+{
+    int i;
+    long hash = 0;
+    unsigned char digest[16];
+    conhash_md5_digest((const u_char*)instr, length, digest);
+
+    /* fold successive 4-byte groups of the digest into the hash */
+    for(i = 0; i < 4; i++)
+    {
+        hash += ((long)(digest[i*4 + 3]&0xFF) << 24)
+            | ((long)(digest[i*4 + 2]&0xFF) << 16)
+            | ((long)(digest[i*4 + 1]&0xFF) << 8)
+            | ((long)(digest[i*4 + 0]&0xFF));
+    }
+    return hash;
+}
+
+void __conhash_node2string(const struct node_s *node, u_int replica_idx, char buf[128], u_int *len)
+{
+    (void)len;
+#if (defined (WIN32) || defined (__WIN32))
+    _snprintf_s(buf, 127, _TRUNCATE, "%s-%03u", node->iden, replica_idx);
+#else
+    snprintf(buf, 127, "%s-%03u", node->iden, replica_idx);
+#endif
+}
+
+void __conhash_add_replicas(struct conhash_s *conhash, struct node_s *node)
+{
+    u_int i, len;
+    long hash;
+    char buff[128];
+    util_rbtree_node_t *rbnode;
+    for(i = 0; i < node->replicas; i++)
+    {
+        /* calc the hash value of each virtual node */
+        __conhash_node2string(node, i, buff, &len);
+        hash = conhash->cb_hashfunc(buff, strlen(buff));
+        /* add virtual node, check duplication */
+        if(util_rbtree_search(&(conhash->vnode_tree), hash) == NULL)
+        {
+            rbnode = __conhash_get_rbnode(node, hash);
+            if(rbnode != NULL)
+            {
+                util_rbtree_insert(&(conhash->vnode_tree), rbnode);
+                conhash->ivnodes++;
+            }
+        }
+    }
+}
+
+void __conhash_del_replicas(struct conhash_s *conhash, struct node_s *node)
+{
+    u_int i, len;
+    long hash;
+    char buff[128];
+    struct virtual_node_s *vnode;
+    util_rbtree_node_t *rbnode;
+    for(i = 0; i < node->replicas; i++)
+    {
+        /* calc the hash value of each virtual node */
+        __conhash_node2string(node, i, buff, &len);
+        hash = conhash->cb_hashfunc(buff, strlen(buff));
+        rbnode = util_rbtree_search(&(conhash->vnode_tree), hash);
+        if(rbnode != NULL)
+        {
+            vnode = rbnode->data;
+            if((vnode->hash == hash) && (vnode->node == node))
+            {
+                conhash->ivnodes--;
+                util_rbtree_delete(&(conhash->vnode_tree), rbnode);
+                __conhash_del_rbnode(rbnode);
+            }
+        }
+    }
+}
+
+util_rbtree_node_t *__conhash_get_rbnode(struct node_s *node, long hash)
+{
+    util_rbtree_node_t *rbnode;
+    rbnode = (util_rbtree_node_t *)malloc(sizeof(util_rbtree_node_t));
+    if(rbnode != NULL)
+    {
+        rbnode->key = hash;
+        rbnode->data = malloc(sizeof(struct virtual_node_s));
+        if(rbnode->data != NULL)
+        {
+            struct virtual_node_s *vnode = rbnode->data;
+            vnode->hash = hash;
+            vnode->node = node;
+        }
+        else
+        {
+            free(rbnode);
+            rbnode = NULL;
+        }
+    }
+    return rbnode;
+}
+
+void __conhash_del_rbnode(util_rbtree_node_t *rbnode)
+{
+    struct virtual_node_s *node;
+    node = rbnode->data;
+    free(node);
+    free(rbnode);
+}
diff --git a/lib/libconhash/conhash_inter.h b/lib/libconhash/conhash_inter.h
new file mode 100644
index 0000000..1c565f7
--- /dev/null
+++ b/lib/libconhash/conhash_inter.h
@@ -0,0 +1,49 @@
+
+/* Copyright (C) 2010. sparkling.liang@hotmail.com. All rights reserved.
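__conhash_add_replicas places a node on the ring under the names iden-000, iden-001, and so on, one hash per virtual node, which is what spreads a single server across the key space. A sketch that reproduces those ring positions (illustration only; it calls the internal __conhash_hash_def directly, which ordinary callers would not do):

```c
#include <stdio.h>
#include <string.h>
#include "conhash.h"
#include "conhash_inter.h"

/* Print the ring position of every virtual node of 'node', using the same
 * "iden-NNN" naming convention that __conhash_add_replicas uses internally. */
static void print_ring_positions(const struct node_s *node)
{
    u_int i;
    char buff[128];
    for (i = 0; i < node->replicas; i++) {
        snprintf(buff, sizeof(buff), "%s-%03u", node->iden, i);
        printf("%s -> %ld\n", buff, __conhash_hash_def(buff, (u_int)strlen(buff)));
    }
}
```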
+ */
+
+#ifndef __CONHASH_INTER_H_
+#define __CONHASH_INTER_H_
+
+#include "configure.h"
+#include "md5.h"
+#include "util_rbtree.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stddef.h>
+
+/* virtual node structure */
+struct virtual_node_s
+{
+    long hash;
+    struct node_s *node; /* pointer to node */
+};
+
+/* consistent hashing */
+struct conhash_s
+{
+    util_rbtree_t vnode_tree; /* rbtree of virtual nodes */
+    u_int ivnodes; /* virtual node number */
+    long (*cb_hashfunc)(const char *, u_int);
+};
+
+struct __get_vnodes_s
+{
+    long *values;
+    long size, cur;
+};
+
+int __conhash_vnode_cmp(const void *v1, const void *v2);
+
+void __conhash_node2string(const struct node_s *node, u_int replica_idx, char buf[128], u_int *len);
+long __conhash_hash_def(const char *instr, u_int length);
+void __conhash_add_replicas(struct conhash_s *conhash, struct node_s *node);
+void __conhash_del_replicas(struct conhash_s *conhash, struct node_s *node);
+
+util_rbtree_node_t *__conhash_get_rbnode(struct node_s *node, long hash);
+void __conhash_del_rbnode(util_rbtree_node_t *rbnode);
+
+#endif /* end __CONHASH_INTER_H_ */
diff --git a/lib/libconhash/conhash_util.c b/lib/libconhash/conhash_util.c
new file mode 100644
index 0000000..a14d494
--- /dev/null
+++ b/lib/libconhash/conhash_util.c
@@ -0,0 +1,45 @@
+
+/* Copyright (C) 2010. sparkling.liang@hotmail.com. All rights reserved. */
+
+#include "conhash.h"
+#include "conhash_inter.h"
+
+void conhash_md5_digest(const u_char *instr, u_int length, u_char digest[16])
+{
+    md5_state_t md5state;
+
+    md5_init(&md5state);
+    md5_append(&md5state, instr, length);
+    md5_finish(&md5state, digest);
+}
+
+static void __get_vnodes(util_rbtree_node_t *node, void *data)
+{
+    struct __get_vnodes_s *vnodes = (struct __get_vnodes_s *)data;
+    if(vnodes->cur < vnodes->size)
+    {
+        vnodes->values[vnodes->cur++] = node->key;
+    }
+}
+
+void conhash_get_vnodes(struct conhash_s *conhash, long *values, int size)
+{
+    struct __get_vnodes_s vnodes;
+    if((conhash==NULL) || (values==NULL) || (size<=0))
+    {
+        return;
+    }
+    vnodes.values = values;
+    vnodes.size = size;
+    vnodes.cur = 0;
+    util_rbtree_mid_travel(&(conhash->vnode_tree), __get_vnodes, &vnodes);
+}
+
+u_int conhash_get_vnodes_num(const struct conhash_s *conhash)
+{
+    if(conhash == NULL)
+    {
+        return 0;
+    }
+    return conhash->ivnodes;
+}
diff --git a/lib/libconhash/md5.c b/lib/libconhash/md5.c
new file mode 100644
index 0000000..c35d96c
--- /dev/null
+++ b/lib/libconhash/md5.c
@@ -0,0 +1,381 @@
+/*
+  Copyright (C) 1999, 2000, 2002 Aladdin Enterprises.  All rights reserved.
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  L. Peter Deutsch
+  ghost@aladdin.com
+
+ */
+/* $Id: md5.c,v 1.6 2002/04/13 19:20:28 lpd Exp $ */
+/*
+  Independent implementation of MD5 (RFC 1321).
+
+  This code implements the MD5 Algorithm defined in RFC 1321, whose
+  text is available at
+       http://www.ietf.org/rfc/rfc1321.txt
+  The code is derived from the text of the RFC, including the test suite
+  (section A.5) but excluding the rest of Appendix A.  It does not include
+  any code or documentation that is identified in the RFC as being
+  copyrighted.
+
+  The original and principal author of md5.c is L. Peter Deutsch
+  <ghost@aladdin.com>. Other authors are noted in the change history
+  that follows (in reverse chronological order):
+
+  2002-04-13 lpd Clarified derivation from RFC 1321; now handles byte order
+       either statically or dynamically; added missing #include <string.h>
+       in library.
+  2002-03-11 lpd Corrected argument list for main(), and added int return
+       type, in test program and T value program.
+  2002-02-21 lpd Added missing #include <stdio.h> in test program.
+  2000-07-03 lpd Patched to eliminate warnings about "constant is
+       unsigned in ANSI C, signed in traditional"; made test program
+       self-checking.
+  1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
+  1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5).
+  1999-05-03 lpd Original version.
+ */
+
+#include "md5.h"
+#include <string.h>
+
+#undef BYTE_ORDER      /* 1 = big-endian, -1 = little-endian, 0 = unknown */
+#ifdef ARCH_IS_BIG_ENDIAN
+#  define BYTE_ORDER (ARCH_IS_BIG_ENDIAN ? 1 : -1)
+#else
+#  define BYTE_ORDER 0
+#endif
+
+#define T_MASK ((md5_word_t)~0)
+#define T1 /* 0xd76aa478 */ (T_MASK ^ 0x28955b87)
+#define T2 /* 0xe8c7b756 */ (T_MASK ^ 0x173848a9)
+#define T3    0x242070db
+#define T4 /* 0xc1bdceee */ (T_MASK ^ 0x3e423111)
+#define T5 /* 0xf57c0faf */ (T_MASK ^ 0x0a83f050)
+#define T6    0x4787c62a
+#define T7 /* 0xa8304613 */ (T_MASK ^ 0x57cfb9ec)
+#define T8 /* 0xfd469501 */ (T_MASK ^ 0x02b96afe)
+#define T9    0x698098d8
+#define T10 /* 0x8b44f7af */ (T_MASK ^ 0x74bb0850)
+#define T11 /* 0xffff5bb1 */ (T_MASK ^ 0x0000a44e)
+#define T12 /* 0x895cd7be */ (T_MASK ^ 0x76a32841)
+#define T13    0x6b901122
+#define T14 /* 0xfd987193 */ (T_MASK ^ 0x02678e6c)
+#define T15 /* 0xa679438e */ (T_MASK ^ 0x5986bc71)
+#define T16    0x49b40821
+#define T17 /* 0xf61e2562 */ (T_MASK ^ 0x09e1da9d)
+#define T18 /* 0xc040b340 */ (T_MASK ^ 0x3fbf4cbf)
+#define T19    0x265e5a51
+#define T20 /* 0xe9b6c7aa */ (T_MASK ^ 0x16493855)
+#define T21 /* 0xd62f105d */ (T_MASK ^ 0x29d0efa2)
+#define T22    0x02441453
+#define T23 /* 0xd8a1e681 */ (T_MASK ^ 0x275e197e)
+#define T24 /* 0xe7d3fbc8 */ (T_MASK ^ 0x182c0437)
+#define T25    0x21e1cde6
+#define T26 /* 0xc33707d6 */ (T_MASK ^ 0x3cc8f829)
+#define T27 /* 0xf4d50d87 */ (T_MASK ^ 0x0b2af278)
+#define T28    0x455a14ed
+#define T29 /* 0xa9e3e905 */ (T_MASK ^ 0x561c16fa)
+#define T30 /* 0xfcefa3f8 */ (T_MASK ^ 0x03105c07)
+#define T31    0x676f02d9
+#define T32 /* 0x8d2a4c8a */ (T_MASK ^ 0x72d5b375)
+#define T33 /* 0xfffa3942 */ (T_MASK ^ 0x0005c6bd)
+#define T34 /* 0x8771f681 */ (T_MASK ^ 0x788e097e)
+#define T35    0x6d9d6122
+#define T36 /* 0xfde5380c */ (T_MASK ^ 0x021ac7f3)
+#define T37 /* 0xa4beea44 */ (T_MASK ^ 0x5b4115bb)
+#define T38    0x4bdecfa9
+#define T39 /* 0xf6bb4b60 */ (T_MASK ^ 0x0944b49f)
+#define T40 /* 0xbebfbc70 */ (T_MASK ^ 0x4140438f)
+#define T41    0x289b7ec6
+#define T42 /* 0xeaa127fa */ (T_MASK ^ 0x155ed805)
+#define T43 /* 0xd4ef3085 */ (T_MASK ^ 0x2b10cf7a)
+#define T44    0x04881d05
+#define T45 /* 0xd9d4d039 */ (T_MASK ^ 0x262b2fc6)
+#define T46 /* 0xe6db99e5
*/ (T_MASK ^ 0x1924661a) +#define T47 0x1fa27cf8 +#define T48 /* 0xc4ac5665 */ (T_MASK ^ 0x3b53a99a) +#define T49 /* 0xf4292244 */ (T_MASK ^ 0x0bd6ddbb) +#define T50 0x432aff97 +#define T51 /* 0xab9423a7 */ (T_MASK ^ 0x546bdc58) +#define T52 /* 0xfc93a039 */ (T_MASK ^ 0x036c5fc6) +#define T53 0x655b59c3 +#define T54 /* 0x8f0ccc92 */ (T_MASK ^ 0x70f3336d) +#define T55 /* 0xffeff47d */ (T_MASK ^ 0x00100b82) +#define T56 /* 0x85845dd1 */ (T_MASK ^ 0x7a7ba22e) +#define T57 0x6fa87e4f +#define T58 /* 0xfe2ce6e0 */ (T_MASK ^ 0x01d3191f) +#define T59 /* 0xa3014314 */ (T_MASK ^ 0x5cfebceb) +#define T60 0x4e0811a1 +#define T61 /* 0xf7537e82 */ (T_MASK ^ 0x08ac817d) +#define T62 /* 0xbd3af235 */ (T_MASK ^ 0x42c50dca) +#define T63 0x2ad7d2bb +#define T64 /* 0xeb86d391 */ (T_MASK ^ 0x14792c6e) + + +static void +md5_process(md5_state_t *pms, const md5_byte_t *data /*[64]*/) +{ + md5_word_t + a = pms->abcd[0], b = pms->abcd[1], + c = pms->abcd[2], d = pms->abcd[3]; + md5_word_t t; +#if BYTE_ORDER > 0 + /* Define storage only for big-endian CPUs. */ + md5_word_t X[16]; +#else + /* Define storage for little-endian or both types of CPUs. */ + md5_word_t xbuf[16]; + const md5_word_t *X; +#endif + + { +#if BYTE_ORDER == 0 + /* + * Determine dynamically whether this is a big-endian or + * little-endian machine, since we can use a more efficient + * algorithm on the latter. + */ + static const int w = 1; + + if (*((const md5_byte_t *)&w)) /* dynamic little-endian */ +#endif +#if BYTE_ORDER <= 0 /* little-endian */ + { + /* + * On little-endian machines, we can process properly aligned + * data without copying it. + */ + if (!((data - (const md5_byte_t *)0) & 3)) { + /* data are properly aligned */ + X = (const md5_word_t *)data; + } else { + /* not aligned */ + memcpy(xbuf, data, 64); + X = xbuf; + } + } +#endif +#if BYTE_ORDER == 0 + else /* dynamic big-endian */ +#endif +#if BYTE_ORDER >= 0 /* big-endian */ + { + /* + * On big-endian machines, we must arrange the bytes in the + * right order. + */ + const md5_byte_t *xp = data; + int i; + +# if BYTE_ORDER == 0 + X = xbuf; /* (dynamic only) */ +# else +# define xbuf X /* (static only) */ +# endif + for (i = 0; i < 16; ++i, xp += 4) + xbuf[i] = xp[0] + (xp[1] << 8) + (xp[2] << 16) + (xp[3] << 24); + } +#endif + } + +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) + + /* Round 1. */ + /* Let [abcd k s i] denote the operation + a = b + ((a + F(b,c,d) + X[k] + T[i]) <<< s). */ +#define F(x, y, z) (((x) & (y)) | (~(x) & (z))) +#define SET(a, b, c, d, k, s, Ti)\ + t = a + F(b,c,d) + X[k] + Ti;\ + a = ROTATE_LEFT(t, s) + b + /* Do the following 16 operations. */ + SET(a, b, c, d, 0, 7, T1); + SET(d, a, b, c, 1, 12, T2); + SET(c, d, a, b, 2, 17, T3); + SET(b, c, d, a, 3, 22, T4); + SET(a, b, c, d, 4, 7, T5); + SET(d, a, b, c, 5, 12, T6); + SET(c, d, a, b, 6, 17, T7); + SET(b, c, d, a, 7, 22, T8); + SET(a, b, c, d, 8, 7, T9); + SET(d, a, b, c, 9, 12, T10); + SET(c, d, a, b, 10, 17, T11); + SET(b, c, d, a, 11, 22, T12); + SET(a, b, c, d, 12, 7, T13); + SET(d, a, b, c, 13, 12, T14); + SET(c, d, a, b, 14, 17, T15); + SET(b, c, d, a, 15, 22, T16); +#undef SET + + /* Round 2. */ + /* Let [abcd k s i] denote the operation + a = b + ((a + G(b,c,d) + X[k] + T[i]) <<< s). */ +#define G(x, y, z) (((x) & (z)) | ((y) & ~(z))) +#define SET(a, b, c, d, k, s, Ti)\ + t = a + G(b,c,d) + X[k] + Ti;\ + a = ROTATE_LEFT(t, s) + b + /* Do the following 16 operations. 
*/ + SET(a, b, c, d, 1, 5, T17); + SET(d, a, b, c, 6, 9, T18); + SET(c, d, a, b, 11, 14, T19); + SET(b, c, d, a, 0, 20, T20); + SET(a, b, c, d, 5, 5, T21); + SET(d, a, b, c, 10, 9, T22); + SET(c, d, a, b, 15, 14, T23); + SET(b, c, d, a, 4, 20, T24); + SET(a, b, c, d, 9, 5, T25); + SET(d, a, b, c, 14, 9, T26); + SET(c, d, a, b, 3, 14, T27); + SET(b, c, d, a, 8, 20, T28); + SET(a, b, c, d, 13, 5, T29); + SET(d, a, b, c, 2, 9, T30); + SET(c, d, a, b, 7, 14, T31); + SET(b, c, d, a, 12, 20, T32); +#undef SET + + /* Round 3. */ + /* Let [abcd k s t] denote the operation + a = b + ((a + H(b,c,d) + X[k] + T[i]) <<< s). */ +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define SET(a, b, c, d, k, s, Ti)\ + t = a + H(b,c,d) + X[k] + Ti;\ + a = ROTATE_LEFT(t, s) + b + /* Do the following 16 operations. */ + SET(a, b, c, d, 5, 4, T33); + SET(d, a, b, c, 8, 11, T34); + SET(c, d, a, b, 11, 16, T35); + SET(b, c, d, a, 14, 23, T36); + SET(a, b, c, d, 1, 4, T37); + SET(d, a, b, c, 4, 11, T38); + SET(c, d, a, b, 7, 16, T39); + SET(b, c, d, a, 10, 23, T40); + SET(a, b, c, d, 13, 4, T41); + SET(d, a, b, c, 0, 11, T42); + SET(c, d, a, b, 3, 16, T43); + SET(b, c, d, a, 6, 23, T44); + SET(a, b, c, d, 9, 4, T45); + SET(d, a, b, c, 12, 11, T46); + SET(c, d, a, b, 15, 16, T47); + SET(b, c, d, a, 2, 23, T48); +#undef SET + + /* Round 4. */ + /* Let [abcd k s t] denote the operation + a = b + ((a + I(b,c,d) + X[k] + T[i]) <<< s). */ +#define I(x, y, z) ((y) ^ ((x) | ~(z))) +#define SET(a, b, c, d, k, s, Ti)\ + t = a + I(b,c,d) + X[k] + Ti;\ + a = ROTATE_LEFT(t, s) + b + /* Do the following 16 operations. */ + SET(a, b, c, d, 0, 6, T49); + SET(d, a, b, c, 7, 10, T50); + SET(c, d, a, b, 14, 15, T51); + SET(b, c, d, a, 5, 21, T52); + SET(a, b, c, d, 12, 6, T53); + SET(d, a, b, c, 3, 10, T54); + SET(c, d, a, b, 10, 15, T55); + SET(b, c, d, a, 1, 21, T56); + SET(a, b, c, d, 8, 6, T57); + SET(d, a, b, c, 15, 10, T58); + SET(c, d, a, b, 6, 15, T59); + SET(b, c, d, a, 13, 21, T60); + SET(a, b, c, d, 4, 6, T61); + SET(d, a, b, c, 11, 10, T62); + SET(c, d, a, b, 2, 15, T63); + SET(b, c, d, a, 9, 21, T64); +#undef SET + + /* Then perform the following additions. (That is increment each + of the four registers by the value it had before this block + was started.) */ + pms->abcd[0] += a; + pms->abcd[1] += b; + pms->abcd[2] += c; + pms->abcd[3] += d; +} + +void +md5_init(md5_state_t *pms) +{ + pms->count[0] = pms->count[1] = 0; + pms->abcd[0] = 0x67452301; + pms->abcd[1] = /*0xefcdab89*/ T_MASK ^ 0x10325476; + pms->abcd[2] = /*0x98badcfe*/ T_MASK ^ 0x67452301; + pms->abcd[3] = 0x10325476; +} + +void +md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes) +{ + const md5_byte_t *p = data; + int left = nbytes; + int offset = (pms->count[0] >> 3) & 63; + md5_word_t nbits = (md5_word_t)(nbytes << 3); + + if (nbytes <= 0) + return; + + /* Update the message length. */ + pms->count[1] += nbytes >> 29; + pms->count[0] += nbits; + if (pms->count[0] < nbits) + pms->count[1]++; + + /* Process an initial partial block. */ + if (offset) { + int copy = (offset + nbytes > 64 ? 64 - offset : nbytes); + + memcpy(pms->buf + offset, p, copy); + if (offset + copy < 64) + return; + p += copy; + left -= copy; + md5_process(pms, pms->buf); + } + + /* Process full blocks. */ + for (; left >= 64; p += 64, left -= 64) + md5_process(pms, p); + + /* Process a final partial block. 
+ */
+    if (left)
+       memcpy(pms->buf, p, left);
+}
+
+void
+md5_finish(md5_state_t *pms, md5_byte_t digest[16])
+{
+    static const md5_byte_t pad[64] = {
+       0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+    };
+    md5_byte_t data[8];
+    int i;
+
+    /* Save the length before padding. */
+    for (i = 0; i < 8; ++i)
+       data[i] = (md5_byte_t)(pms->count[i >> 2] >> ((i & 3) << 3));
+    /* Pad to 56 bytes mod 64. */
+    md5_append(pms, pad, ((55 - (pms->count[0] >> 3)) & 63) + 1);
+    /* Append the length. */
+    md5_append(pms, data, 8);
+    for (i = 0; i < 16; ++i)
+       digest[i] = (md5_byte_t)(pms->abcd[i >> 2] >> ((i & 3) << 3));
+}
diff --git a/lib/libconhash/md5.h b/lib/libconhash/md5.h
new file mode 100644
index 0000000..d9a45cb
--- /dev/null
+++ b/lib/libconhash/md5.h
@@ -0,0 +1,101 @@
+/*
+  Copyright (C) 1999, 2002 Aladdin Enterprises.  All rights reserved.
+
+  This software is provided 'as-is', without any express or implied
+  warranty.  In no event will the authors be held liable for any damages
+  arising from the use of this software.
+
+  Permission is granted to anyone to use this software for any purpose,
+  including commercial applications, and to alter it and redistribute it
+  freely, subject to the following restrictions:
+
+  1. The origin of this software must not be misrepresented; you must not
+     claim that you wrote the original software. If you use this software
+     in a product, an acknowledgment in the product documentation would be
+     appreciated but is not required.
+  2. Altered source versions must be plainly marked as such, and must not be
+     misrepresented as being the original software.
+  3. This notice may not be removed or altered from any source distribution.
+
+  L. Peter Deutsch
+  ghost@aladdin.com
+
+ */
+/* $Id: md5.h,v 1.4 2002/04/13 19:20:28 lpd Exp $ */
+/*
+  Independent implementation of MD5 (RFC 1321).
+
+  This code implements the MD5 Algorithm defined in RFC 1321, whose
+  text is available at
+       http://www.ietf.org/rfc/rfc1321.txt
+  The code is derived from the text of the RFC, including the test suite
+  (section A.5) but excluding the rest of Appendix A.  It does not include
+  any code or documentation that is identified in the RFC as being
+  copyrighted.
+
+  The original and principal author of md5.h is L. Peter Deutsch
+  <ghost@aladdin.com>. Other authors are noted in the change history
+  that follows (in reverse chronological order):
+
+  2002-04-13 lpd Removed support for non-ANSI compilers; removed
+       references to Ghostscript; clarified derivation from RFC 1321;
+       now handles byte order either statically or dynamically.
+  1999-11-04 lpd Edited comments slightly for automatic TOC extraction.
+  1999-10-18 lpd Fixed typo in header comment (ansi2knr rather than md5);
+       added conditionalization for C++ compilation from Martin
+       Purschke <purschke@bnl.gov>.
+  1999-05-03 lpd Original version.
+ */
+
+#ifndef md5_INCLUDED
+# define md5_INCLUDED
+
+/*
+ * This package supports both compile-time and run-time determination of CPU
+ * byte order.  If ARCH_IS_BIG_ENDIAN is defined as 0, the code will be
+ * compiled to run only on little-endian CPUs; if ARCH_IS_BIG_ENDIAN is
+ * defined as non-zero, the code will be compiled to run only on big-endian
+ * CPUs; if ARCH_IS_BIG_ENDIAN is not defined, the code will be compiled to
+ * run on either big- or little-endian CPUs, but will run slightly less
+ * efficiently on either one than if ARCH_IS_BIG_ENDIAN is defined.
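The three-call pattern above (init once, append any number of times, finish) is exactly how conhash_md5_digest drives this code. A sketch of computing a printable hex digest with it (illustration only, not part of the patch):

```c
#include <stdio.h>
#include <string.h>
#include "md5.h"

/* Hash 'msg' and write its 32-character lowercase hex digest into 'out'. */
static void md5_hex(const char *msg, char out[33])
{
    md5_state_t state;
    md5_byte_t digest[16];
    int i;

    md5_init(&state);
    md5_append(&state, (const md5_byte_t *)msg, (int)strlen(msg));
    md5_finish(&state, digest);
    for (i = 0; i < 16; i++)
        sprintf(out + i * 2, "%02x", digest[i]);
}
```

md5_append can also be called once per buffer when hashing a stream in chunks; md5_finish applies the RFC 1321 padding and length block.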
+ */
+
+typedef unsigned char md5_byte_t; /* 8-bit byte */
+typedef unsigned int md5_word_t; /* 32-bit word */
+
+/* Define the state of the MD5 Algorithm. */
+typedef struct md5_state_s {
+    md5_word_t count[2];   /* message length in bits, lsw first */
+    md5_word_t abcd[4];    /* digest buffer */
+    md5_byte_t buf[64];    /* accumulate block */
+} md5_state_t;
+
+#ifdef __cplusplus
+extern "C"
+{
+#endif
+
+/* Initialize the algorithm. */
+
+#ifdef WIN32
+__declspec(dllexport)
+#endif
+void md5_init(md5_state_t *pms);
+
+/* Append a string to the message. */
+#ifdef WIN32
+__declspec(dllexport)
+#endif
+void md5_append(md5_state_t *pms, const md5_byte_t *data, int nbytes);
+
+/* Finish the message and return the digest. */
+#ifdef WIN32
+__declspec(dllexport)
+#endif
+void md5_finish(md5_state_t *pms, md5_byte_t digest[16]);
+
+#ifdef __cplusplus
+}  /* end extern "C" */
+#endif
+
+#endif /* md5_INCLUDED */
diff --git a/lib/libconhash/sample.c b/lib/libconhash/sample.c
new file mode 100644
index 0000000..678127a
--- /dev/null
+++ b/lib/libconhash/sample.c
@@ -0,0 +1,57 @@
+
+#include <stdio.h>
+#include <string.h>
+
+#include "conhash.h"
+
+struct node_s g_nodes[64];
+int main(void)
+{
+    int i;
+    const struct node_s *node;
+    char str[128];
+    long hashes[512];
+
+    /* init conhash instance */
+    struct conhash_s *conhash = conhash_init(NULL);
+    if(conhash)
+    {
+        /* set nodes */
+        conhash_set_node(&g_nodes[0], "titanic", 32, NULL);
+        conhash_set_node(&g_nodes[1], "terminator2018", 24, NULL);
+        conhash_set_node(&g_nodes[2], "Xenomorph", 25, NULL);
+        conhash_set_node(&g_nodes[3], "True Lies", 10, NULL);
+        conhash_set_node(&g_nodes[4], "avantar", 48, NULL);
+
+        /* add nodes */
+        conhash_add_node(conhash, &g_nodes[0]);
+        conhash_add_node(conhash, &g_nodes[1]);
+        conhash_add_node(conhash, &g_nodes[2]);
+        conhash_add_node(conhash, &g_nodes[3]);
+        conhash_add_node(conhash, &g_nodes[4]);
+
+        printf("virtual nodes number %u\n", conhash_get_vnodes_num(conhash));
+        printf("the hashing results--------------------------------------:\n");
+
+        /* try object */
+        for(i = 0; i < 20; i++)
+        {
+            sprintf(str, "James.km%03d", i);
+            node = conhash_lookup(conhash, str, (u_int)strlen(str));
+            if(node) printf("[%16s] is in node: [%16s]\n", str, node->iden);
+        }
+        conhash_get_vnodes(conhash, hashes, sizeof(hashes)/sizeof(hashes[0]));
+        conhash_del_node(conhash, &g_nodes[2]);
+        printf("remove node[%s], virtual nodes number %u\n", g_nodes[2].iden, conhash_get_vnodes_num(conhash));
+        printf("the hashing results--------------------------------------:\n");
+        for(i = 0; i < 20; i++)
+        {
+            sprintf(str, "James.km%03d", i);
+            node = conhash_lookup(conhash, str, (u_int)strlen(str));
+            if(node) printf("[%16s] is in node: [%16s]\n", str, node->iden);
+        }
+    }
+    conhash_fini(conhash);
+    return 0;
+}
diff --git a/lib/libconhash/util_rbtree.c b/lib/libconhash/util_rbtree.c
new file mode 100644
index 0000000..164773f
--- /dev/null
+++ b/lib/libconhash/util_rbtree.c
@@ -0,0 +1,552 @@
+
+/* Copyright (C) 2010. sparkling.liang@hotmail.com. All rights reserved. */
+
+#include "util_rbtree.h"
+
+/* the NULL node of tree */
+#define _NULL(rbtree) (&((rbtree)->null))
+
+/* structure used to check a rb tree */
+struct rbtree_check_s
+{
+    short rbh;  /* rb height of the tree */
+    short maxd; /* max depth of the tree */
+    int fini;   /* check failed ? */
+    const util_rbtree_node_t *null; /* sentinel of the tree */
+};
+
+typedef struct rbtree_check_s rbtree_check_t;
+
+static void rbtree_insert_fixup(util_rbtree_t *rbtree, util_rbtree_node_t *node);
+static void rbtree_delete_fixup(util_rbtree_t *rbtree, util_rbtree_node_t *node);
+static void rbtree_left_rotate(util_rbtree_t *rbtree, util_rbtree_node_t *node);
+static void rbtree_right_rotate(util_rbtree_t *rbtree, util_rbtree_node_t *node);
+
+void util_rbtree_init(util_rbtree_t *rbtree)
+{
+    if(rbtree != NULL)
+    {
+        util_rbt_black(_NULL(rbtree)); /* null MUST be black */
+        rbtree->root = _NULL(rbtree);
+        rbtree->size = 0;
+    }
+}
+
+util_rbtree_node_t* util_rbsubtree_min(util_rbtree_node_t *node, util_rbtree_node_t *sentinel)
+{
+    if(node == sentinel) return NULL;
+    while(node->left != sentinel) node = node->left;
+    return node;
+}
+
+util_rbtree_node_t* util_rbsubtree_max(util_rbtree_node_t *node, util_rbtree_node_t *sentinel)
+{
+    if(node == sentinel) return NULL;
+    while(node->right != sentinel) node = node->right;
+    return node;
+}
+
+void util_rbtree_insert(util_rbtree_t *rbtree, util_rbtree_node_t *node)
+{
+    util_rbtree_node_t *x, *y;
+    if((rbtree==NULL) || (node==NULL) || (node==_NULL(rbtree)))
+    {
+        return;
+    }
+    /* the tree is empty */
+    if(rbtree->root == _NULL(rbtree))
+    {
+        rbtree->root = node;
+        node->parent = _NULL(rbtree);
+    }
+    else /* find the insert position */
+    {
+        x = rbtree->root;
+        while(x != _NULL(rbtree))
+        {
+            y = x;
+            if(node->key < x->key) x = x->left;
+            else x = x->right;
+        }
+        /* now y is node's parent */
+        node->parent = y;
+        if(node->key < y->key) y->left = node;
+        else y->right = node;
+    }
+
+    /* initialize node's link & color */
+    node->left = _NULL(rbtree);
+    node->right = _NULL(rbtree);
+    util_rbt_red(node);
+    /* fix up insert */
+    rbtree_insert_fixup(rbtree, node);
+    rbtree->size++;
+}
+
+/* insert may violate the rbtree properties, fix up the tree */
+void rbtree_insert_fixup(util_rbtree_t *rbtree, util_rbtree_node_t *node)
+{
+    util_rbtree_node_t *p, *u; /* u is the uncle node of node */
+    while(util_rbt_isred(node->parent))
+    {
+        p = node->parent;
+        if(p == p->parent->left) /* parent is the left child */
+        {
+            u = p->parent->right;
+            if(util_rbt_isred(u)) /* case 1: p & u are red */
+            {
+                util_rbt_black(u);
+                util_rbt_black(p);
+                util_rbt_red(p->parent);
+                node = p->parent;
+            }
+            else
+            {
+                if(node == p->right) /* case 2: p:red, u:black, node is right child */
+                {
+                    node = p;
+                    rbtree_left_rotate(rbtree, node);
+                    p = node->parent;
+                }
+                /* case 3: p:red, u:black, node is left child */
+                util_rbt_black(p);
+                util_rbt_red(p->parent);
+                rbtree_right_rotate(rbtree, p->parent);
+            }
+        }
+        else /* parent is the right child */
+        {
+            u = p->parent->left;
+            if(util_rbt_isred(u))
+            {
+                util_rbt_black(u);
+                util_rbt_black(p);
+                util_rbt_red(p->parent);
+                node = p->parent;
+            }
+            else
+            {
+                if(p->left == node)
+                {
+                    node = p;
+                    rbtree_right_rotate(rbtree, node);
+                    p = node->parent;
+                }
+                util_rbt_black(p);
+                util_rbt_red(p->parent);
+                rbtree_left_rotate(rbtree, p->parent);
+            }
+        }
+    }
+    /* mark the root black */
+    util_rbt_black(rbtree->root);
+}
+
+void util_rbtree_delete(util_rbtree_t *rbtree, util_rbtree_node_t *node)
+{
+    int isblack;
+    util_rbtree_node_t *temp, *subst;
+    if((rbtree==NULL) || (node==NULL) || (node==_NULL(rbtree)))
+    {
+        return;
+    }
+    rbtree->size--;
+    /* find deleted position, indicated by temp */
+    if(node->left == _NULL(rbtree))
+    {
+        temp = node;
+        subst = node->right;
+    }
+    else if(node->right == _NULL(rbtree))
+    {
+        temp = node;
+        subst = node->left;
+    }
+    else /* right & left aren't null */
+    {
+        temp = util_rbsubtree_min(node->right, _NULL(rbtree));
+        if(temp->left != _NULL(rbtree))
+        {
+            subst = temp->left;
+        }
+        else
+        {
+            subst = temp->right;
+        }
+    }
+    if(temp == rbtree->root) /* temp is root */
+    {
+        rbtree->root = subst;
+        util_rbt_black(subst);
+        rbt_clear_node(temp);
+        return;
+    }
+    isblack = util_rbt_isblack(temp);
+    /* temp will be removed from its position, rebuild links
+     * NOTE: if temp->parent == node, then subst->parent is node,
+     * while node is the one to be deleted, so relink subst's parent to temp,
+     * because temp will replace node in the tree
+     */
+    if(temp->parent == node)
+    {
+        subst->parent = temp;
+    }
+    else
+    {
+        subst->parent = temp->parent;
+    }
+
+    if(temp == temp->parent->left)
+    {
+        temp->parent->left = subst;
+    }
+    else
+    {
+        temp->parent->right = subst;
+    }
+    /*
+     * now temp is removed from the tree.
+     * so we will make temp replace node in the tree.
+     */
+    if(temp != node)
+    {
+        temp->parent = node->parent;
+        if(node == rbtree->root) /* node may be the root */
+        {
+            rbtree->root = temp;
+        }
+        else
+        {
+            if(node->parent->left == node)
+            {
+                node->parent->left = temp;
+            }
+            else
+            {
+                node->parent->right = temp;
+            }
+        }
+        temp->right = node->right;
+        temp->left = node->left;
+        if(temp->left != _NULL(rbtree))
+        {
+            temp->left->parent = temp;
+        }
+        if(temp->right != _NULL(rbtree))
+        {
+            temp->right->parent = temp;
+        }
+        temp->color = node->color;
+    }
+    rbt_clear_node(node);
+
+    if(isblack)
+    {
+        /* temp is black, fix up delete */
+        rbtree_delete_fixup(rbtree, subst);
+    }
+}
+
+/* delete may violate the rbtree properties, fix up the tree */
+void rbtree_delete_fixup(util_rbtree_t *rbtree, util_rbtree_node_t *node)
+{
+    util_rbtree_node_t *w;
+    while((node != rbtree->root) && util_rbt_isblack(node))
+    {
+        if(node == node->parent->left) /* node is left child */
+        {
+            w = node->parent->right;
+            if(util_rbt_isred(w))
+            {
+                util_rbt_black(w);
+                util_rbt_red(node->parent);
+                rbtree_left_rotate(rbtree, node->parent);
+                w = node->parent->right;
+            }
+            if(util_rbt_isblack(w->left) && util_rbt_isblack(w->right))
+            {
+                util_rbt_red(w);
+                node = node->parent;
+            }
+            else
+            {
+                if(util_rbt_isblack(w->right))
+                {
+                    util_rbt_black(w->left);
+                    util_rbt_red(w);
+                    rbtree_right_rotate(rbtree, w);
+                    w = node->parent->right;
+                }
+                w->color = node->parent->color;
+                util_rbt_black(node->parent);
+                util_rbt_black(w->right);
+                rbtree_left_rotate(rbtree, node->parent);
+                node = rbtree->root; /* to break loop */
+            }
+        }
+        else /* node is right child */
+        {
+            w = node->parent->left;
+            if(util_rbt_isred(w))
+            {
+                util_rbt_black(w);
+                util_rbt_red(node->parent);
+                rbtree_right_rotate(rbtree, node->parent);
+                w = node->parent->left;
+            }
+            if(util_rbt_isblack(w->left) && util_rbt_isblack(w->right))
+            {
+                util_rbt_red(w);
+                node = node->parent;
+            }
+            else
+            {
+                if(util_rbt_isblack(w->left))
+                {
+                    util_rbt_black(w->right);
+                    util_rbt_red(w);
+                    rbtree_left_rotate(rbtree, w);
+                    w = node->parent->left;
+                }
+                w->color = node->parent->color;
+                util_rbt_black(node->parent);
+                util_rbt_black(w->left);
+                rbtree_right_rotate(rbtree, node->parent);
+                node = rbtree->root; /* to break loop */
+            }
+        }
+    }
+    util_rbt_black(node);
+}
+
+void rbtree_left_rotate(util_rbtree_t *rbtree, util_rbtree_node_t *node)
+{
+    util_rbtree_node_t *rc = node->right;
+    util_rbtree_node_t *rclc = rc->left;
+    /* make rc take node's position */
+    rc->parent = node->parent;
+    if(node == rbtree->root)
+    {
+        rbtree->root = rc;
+    }
+    else
+    {
+        if(node->parent->left == node) /* node is left child */
+        {
+            node->parent->left = rc;
+        }
+        else
+        {
+            node->parent->right = rc;
+        }
+    }
+    /* make node rc's left child */
+    node->parent = rc;
+    rc->left = node;
+    /* rc's old left child becomes node's right child */
+    node->right = rclc;
+    if(rclc != _NULL(rbtree))
+    {
+        rclc->parent = node;
+    }
+}
+
+void rbtree_right_rotate(util_rbtree_t *rbtree, util_rbtree_node_t *node)
+{
+    util_rbtree_node_t *lc = node->left;
+    util_rbtree_node_t *lcrc = lc->right;
+    /* make lc take node's position */
+    lc->parent = node->parent;
+    if(node == rbtree->root)
+    {
+        rbtree->root = lc;
+    }
+    else
+    {
+        if(node->parent->left == node) /* node is left child */
+        {
+            node->parent->left = lc;
+        }
+        else
+        {
+            node->parent->right = lc;
+        }
+    }
+    /* make node lc's right child */
+    lc->right = node;
+    node->parent = lc;
+    /* lc's old right child becomes node's left child */
+    node->left = lcrc;
+    if(lcrc != _NULL(rbtree))
+    {
+        lcrc->parent = node;
+    }
+}
+
+util_rbtree_node_t* util_rbtree_search(util_rbtree_t *rbtree, long key)
+{
+    if(rbtree != NULL)
+    {
+        util_rbtree_node_t *node = rbtree->root;
+        util_rbtree_node_t *null = _NULL(rbtree);
+        while(node != null)
+        {
+            if(key < node->key) node = node->left;
+            else if(key > node->key) node = node->right;
+            else return node;
+        }
+    }
+    return NULL;
+}
+
+util_rbtree_node_t* util_rbtree_lookup(util_rbtree_t *rbtree, long key)
+{
+    if((rbtree != NULL) && !util_rbtree_isempty(rbtree))
+    {
+        util_rbtree_node_t *node = NULL;
+        util_rbtree_node_t *temp = rbtree->root;
+        util_rbtree_node_t *null = _NULL(rbtree);
+        while(temp != null)
+        {
+            if(key <= temp->key)
+            {
+                node = temp; /* update node */
+                temp = temp->left;
+            }
+            else if(key > temp->key)
+            {
+                temp = temp->right;
+            }
+        }
+        /* if node==NULL, no key >= @key exists: wrap to the minimum node */
+        return ((node != NULL) ?
node : util_rbtree_min(rbtree)); + } + return NULL; +} + +static void rbtree_check_subtree(const util_rbtree_node_t *node, rbtree_check_t *check, + int level, int curheight) +{ + if(check->fini) /* already failed */ + { + return; + } + /* check node color */ + if(util_rbt_isblack(node)) + { + curheight++; + } + else if(!util_rbt_isred(node)) + { + check->fini = 2; + return; + } + /* check left */ + if(node->left != check->null) + { + if(util_rbt_isred(node) && util_rbt_isred(node->left)) + { + check->fini = 4; + return; + } + if(node->key < node->left->key) + { + check->fini = 5; + return; + } + rbtree_check_subtree(node->left, check, level+1, curheight); + } + else + { + goto __check_rb_height; + } + /* check right */ + if(node->right != check->null) + { + if(util_rbt_isred(node) && util_rbt_isred(node->right)) + { + check->fini = 4; + return; + } + if(node->key > node->right->key) + { + check->fini = 5; + return; + } + rbtree_check_subtree(node->right, check, level+1, curheight); + } + else + { + goto __check_rb_height; + } + return; +__check_rb_height: + if(check->rbh == 0) + { + check->rbh = curheight; + } + if(check->maxd < level) + { + check->maxd = level; + } + if(check->rbh != curheight) + { + check->fini = 3; + } +} + +int util_rbtree_check(const util_rbtree_t *rbtree, int *blackheight, int *maxdepth) +{ + rbtree_check_t check; + if(rbtree->root == _NULL(rbtree)) + { + return 0; + } + if(!util_rbt_isblack(rbtree->root)) + { + return 1; + } + check.fini = check.maxd = check.rbh = 0; + check.null = _NULL(rbtree); + rbtree_check_subtree(rbtree->root, &check, 1, 0); + if(blackheight) + { + *blackheight = check.rbh; + } + if(maxdepth) + { + *maxdepth = check.maxd; + } + return check.fini; +} + +static void rbtree_mid_travel(util_rbtree_node_t *node, util_rbtree_node_t *sentinel, + void(*opera)(util_rbtree_node_t *, void *), void *data) +{ + if(node->left != sentinel) + { + rbtree_mid_travel(node->left, sentinel, opera, data); + } + opera(node, data); + if(node->right != sentinel) + { + rbtree_mid_travel(node->right, sentinel, opera, data); + } +} + +void util_rbtree_mid_travel(util_rbtree_t *rbtree, + void(*opera)(util_rbtree_node_t *, void *), void *data) +{ + if((rbtree!=NULL) && !util_rbtree_isempty(rbtree)) + { + rbtree_mid_travel(rbtree->root, _NULL(rbtree), opera, data); + } +} diff --git a/lib/libconhash/util_rbtree.h b/lib/libconhash/util_rbtree.h new file mode 100644 index 0000000..8c8c419 --- /dev/null +++ b/lib/libconhash/util_rbtree.h @@ -0,0 +1,114 @@ + +/* Copyright (C) 2010. sparkling.liang@hotmail.com. All rights reserved. 
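The tree is intrusive: callers allocate util_rbtree_node_t themselves, set key and data, and the tree only rewires links. A standalone usage sketch (illustration only, not part of the patch):

```c
#include <stdio.h>
#include "util_rbtree.h"

static void visit(util_rbtree_node_t *n, void *data)
{
    (void)data;
    printf("%ld ", n->key);
}

static void rbtree_demo(void)
{
    util_rbtree_t tree;
    util_rbtree_node_t nodes[5];
    long keys[5] = { 42, 7, 99, -3, 15 };
    int i;

    util_rbtree_init(&tree);
    for (i = 0; i < 5; i++) {
        nodes[i].key = keys[i];  /* insert sets links and color itself */
        nodes[i].data = NULL;
        util_rbtree_insert(&tree, &nodes[i]);
    }
    util_rbtree_mid_travel(&tree, visit, NULL); /* prints keys in ascending order */
    printf("\n");
}
```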
+ */
+
+#ifndef __UTIL_RBTREE_H_
+#define __UTIL_RBTREE_H_
+
+#include "configure.h"
+#include <stddef.h>
+
+typedef struct util_rbtree_s util_rbtree_t;
+typedef struct util_rbtree_node_s util_rbtree_node_t;
+
+struct util_rbtree_node_s
+{
+    long key;
+    util_rbtree_node_t *parent;
+    util_rbtree_node_t *right;
+    util_rbtree_node_t *left;
+    int color;
+    void *data;
+};
+
+struct util_rbtree_s
+{
+    util_rbtree_node_t *root;
+    util_rbtree_node_t null;
+    u_int size;
+};
+
+#define util_rbt_black(rbnode)   ((rbnode)->color = 1)
+#define util_rbt_red(rbnode)     ((rbnode)->color = 0)
+#define util_rbt_isblack(rbnode) ((rbnode)->color == 1)
+#define util_rbt_isred(rbnode)   ((rbnode)->color == 0)
+
+/* clear a node's links */
+#define rbt_clear_node(node) do{ \
+        (node)->left = NULL;   \
+        (node)->right = NULL;  \
+        (node)->parent = NULL; \
+    }while(0)
+
+/* is the tree empty */
+#define util_rbtree_isempty(rbtree) ((rbtree)->root == &(rbtree)->null)
+
+/*
+ * find the min node of the tree
+ * return NULL if the tree is empty
+ */
+#define util_rbtree_min(rbtree) util_rbsubtree_min((rbtree)->root, &(rbtree)->null)
+
+/*
+ * find the max node of the tree
+ * return NULL if the tree is empty
+ */
+#define util_rbtree_max(rbtree) util_rbsubtree_max((rbtree)->root, &(rbtree)->null)
+
+void util_rbtree_init(util_rbtree_t *rbtree);
+void util_rbtree_insert(util_rbtree_t *rbtree, util_rbtree_node_t *node);
+void util_rbtree_delete(util_rbtree_t *rbtree, util_rbtree_node_t *node);
+
+/*
+ * search the node with key = @key in the tree
+ * if no such node exists, return NULL
+ */
+util_rbtree_node_t* util_rbtree_search(util_rbtree_t *rbtree, long key);
+
+/*
+ * look up a node in the tree
+ * return the first node with key >= @key;
+ * if @key > all the key values in the tree, return the node with the minimum key
+ * return NULL if the tree is empty
+ */
+util_rbtree_node_t* util_rbtree_lookup(util_rbtree_t *rbtree, long key);
+
+/*
+ * find the min node of a subtree
+ * @node: root of the subtree
+ * @sentinel: the sentinel node
+ * return NULL if the subtree is empty
+ */
+util_rbtree_node_t* util_rbsubtree_min(util_rbtree_node_t *node, util_rbtree_node_t *sentinel);
+
+/*
+ * find the max node of a subtree
+ * @node: root of the subtree
+ * @sentinel: the sentinel node
+ * return NULL if the subtree is empty
+ */
+util_rbtree_node_t* util_rbsubtree_max(util_rbtree_node_t *node, util_rbtree_node_t *sentinel);
+
+/*
+ * check whether a tree is a valid rb tree; the null node isn't checked
+ * return 0: yes
+ * return 1: root isn't black
+ * return 2: a node has a color other than black and red
+ * return 3: the tree's black height isn't unique
+ * return 4: a red node with a red parent exists
+ * return 5: the binary search property is violated
+ *
+ * when the return value is !0, @blackheight & @maxdepth are useless
+ * when the return value is 0, @blackheight contains the tree's black height
+ *
+ * @maxdepth contains the max length of all simple paths from the root to its leaf nodes
+ */
+int util_rbtree_check(const util_rbtree_t *rbtree, int *blackheight, int *maxdepth);
+
+/*
+ * travel through a rb tree in sequence: left-root-right
+ * you MUST NOT do any operation that would break the RB properties
+ */
+void util_rbtree_mid_travel(util_rbtree_t *rbtree, void(*opera)(util_rbtree_node_t *, void *), void *data);
+
+#endif /* end __UTIL_RBTREE_H_ */
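util_rbtree_lookup's successor-or-minimum rule documented above is what closes the hash ring for conhash_lookup: a key hashing past the largest virtual node wraps around to the smallest one. A sketch of that behavior (illustration only; assumes a tree already holding exactly the keys 10, 20 and 30):

```c
#include <assert.h>
#include "util_rbtree.h"

/* With keys {10, 20, 30} in the tree:
 *   lookup(15) -> node keyed 20 (first key >= 15, i.e. next vnode clockwise)
 *   lookup(35) -> node keyed 10 (past the maximum: wrap to the minimum)    */
static void ring_semantics_demo(util_rbtree_t *tree)
{
    assert(util_rbtree_lookup(tree, 15)->key == 20);
    assert(util_rbtree_lookup(tree, 35)->key == 10);
}
```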