forked from Lisp-Stat/data-frame
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata-frame-column.lisp
105 lines (93 loc) · 4.67 KB
/
data-frame-column.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
;;; -*- Mode: LISP; Base: 10; Syntax: ANSI-Common-lisp; Package: DATA-FRAME.COLUMN -*-
;;; Copyright (c) 2020 by Symbolics Pte. Ltd. All rights reserved.
(cl:in-package :data-frame.column)
(defgeneric column-length (column)
(:documentation "Return the length of column.")
(:method ((column vector))
(length column)))
(defstruct vector-summary%
"Base class for summarizing vectors. Not exported."
(length 0 :type array-index :read-only t))
(defun print-count-and-percentage (stream count length)
"Print COUNT as is and also as a rounded percentage of "
(format stream "~D (~D%)" count (round (/ count length) 1/100)))
(defstruct (bit-vector-summary (:include vector-summary%))
"Summary of a bit vector."
(count 0 :type array-index :read-only t))
(defmethod print-object ((summary bit-vector-summary) stream)
(let+ (((&structure-r/o bit-vector-summary- length count) summary))
(princ "bits, ones: " stream)
(print-count-and-percentage stream count length)))
(defstruct quantiles-summary
"Summary of a real elements (using quantiles)."
(count 0 :type array-index :read-only t)
(min 0 :type real :read-only t)
(q25 0 :type real :read-only t)
(q50 0 :type real :read-only t)
(q75 0 :type real :read-only t)
(max 0 :type real :read-only t))
(defstruct (generic-vector-summary (:include vector-summary%))
"Summary for generic vectors."
(quantiles nil :type (or null quantiles-summary) :read-only t)
(element-count-alist nil :type list :read-only t))
(defun ensure-not-ratio (real)
"When REAL is a RATIO, convert it to a float, otherwise return as is. Used for printing."
(if (typep real 'ratio)
(float real 1.0)
real))
(defparameter *column-summary-quantiles-threshold* 10
"If the number of reals exceeds this threshold, they will be summarized with quantiles.")
(defgeneric column-summary (column)
(:documentation "Return an object that summarizes COLUMN of a DATA-FRAME. Primarily intended for printing, not analysis, returned values should print nicely.")
(:method ((column bit-vector))
(make-bit-vector-summary :length (length column) :count (count 1 column)))
(:method ((column vector))
(let+ ((length (length column))
(table (aprog1 (nu:make-sparse-counter :test #'equal)
(map nil (curry #'nu:add it) column)))
(alist (as-alist table))
((&flet real? (item) (realp (car item))))
(reals-alist (remove-if (complement #'real?) alist))
(quantiles (when (< *column-summary-quantiles-threshold*
(length reals-alist))
(let+ ((#(min q25 q50 q75 max)
(nu:weighted-quantiles
(mapcar #'car reals-alist)
(mapcar #'cdr reals-alist)
#(0 1/4 1/2 3/4 1))))
(make-quantiles-summary
:count (reduce #'+ reals-alist :key #'cdr)
:min min :q25 q25 :q50 q50 :q75 q75 :max max))))
(alist (stable-sort (if quantiles
(remove-if #'real? alist)
(copy-list alist))
#'>= :key #'cdr)))
(make-generic-vector-summary :length length
:quantiles quantiles
:element-count-alist alist))))
(defmethod print-object ((summary generic-vector-summary) stream)
(let+ (((&structure-r/o generic-vector-summary- length quantiles
element-count-alist) summary))
#+sbcl ;; complains about unreachable code
(declare (sb-ext:muffle-conditions sb-ext:compiler-note))
(pprint-logical-block (stream nil)
(pprint-logical-block (stream nil)
(when quantiles
(let+ (((&structure-r/o quantiles-summary- count min q25 q50 q75 max)
quantiles))
(format stream
"~W reals, ~:_min=~W, ~:_q25=~W, ~:_q50=~W, ~:_q75=~W, ~:_max=~W"
count min (ensure-not-ratio q25) (ensure-not-ratio q50)
(ensure-not-ratio q75) max))))
(when (and quantiles element-count-alist)
(format stream "; ")
(pprint-newline :linear stream))
(pprint-logical-block (stream element-count-alist)
(loop (pprint-exit-if-list-exhausted)
;; (when quantiles
;; (format stream ", ~@_"))
(let+ (((element . count) (pprint-pop)))
(print-count-and-percentage stream count length)
(format stream " x ~W" element))
(pprint-exit-if-list-exhausted)
(format stream ", ~_"))))))