Skip to content

Commit 65f5eb3

Browse files
ColtAllentwiecki
authored and committed
RFM Segmentation (#680)
* init rfm_segments func * TODOs * docstrings and for loop * docstrings and for loop * WIP dev notebook debugging * checkpoint commit for remote pull * code testing in dev notebook * unit tests added * dev notebook cleanup * clean up type hints * comments and code cleanup * docstrings * move formatting to rfm_summary and quickstart edits * fix rfm_train_test_split bug * added test for rfm_quartile_labels * added rfm score warning
1 parent 690a79a commit 65f5eb3

File tree

5 files changed

+498
-89
lines changed

5 files changed

+498
-89
lines changed

docs/source/notebooks/clv/clv_quickstart.ipynb

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -67,10 +67,10 @@
6767
"* `customer_id` represents a unique identifier for each customer.\n",
6868
"* `frequency` represents the number of _repeat_ purchases that a customer has made, i.e. one less than the total number of purchases.\n",
6969
"* `T` represents a customer's \"age\", i.e. the duration between a customer's first purchase and the end of the period of study. In this example notebook, the units of time are in weeks.\n",
70-
"* `recency` represents the timepoint when a customer made their most recent purchase. This is also equal to the duration between a customer’s first non-repeat purchase (usually time 0) and last purchase. If a customer has made only 1 purchase, their recency is 0;\n",
70+
"* `recency` represents the time period when a customer made their most recent purchase. This is equal to the duration between a customer’s first and last purchase. If a customer has made only 1 purchase, their recency is 0.\n",
7171
"* `monetary_value` represents the average value of a given customer’s repeat purchases. Customers who have only made a single purchase have monetary values of zero.\n",
7272
"\n",
73-
"If working with raw transaction data, the `rfm_summary` function can be used to preprocess data for modeling:"
73+
"The `rfm_summary` function can be used to preprocess raw transaction data for modeling:"
7474
]
7575
},
7676
{
@@ -339,6 +339,8 @@
339339
"id": "514ee548",
340340
"metadata": {},
341341
"source": [
342+
"It is important to note these definitions differ from that used in RFM segmentation, where the first purchase is included, `T` is not used, and `recency` is the number of time periods since a customer's most recent purchase.\n",
343+
"\n",
342344
"To visualize data in RFM format, we can plot the recency and T of the customers with the `plot_customer_exposure` function. We see a large chunk (>60%) of customers haven't made another purchase in a while."
343345
]
344346
},
@@ -2579,7 +2581,7 @@
25792581
"name": "python",
25802582
"nbconvert_exporter": "python",
25812583
"pygments_lexer": "ipython3",
2582-
"version": "3.9.18"
2584+
"version": "3.10.14"
25832585
},
25842586
"toc": {
25852587
"base_numbering": 1,

docs/source/notebooks/clv/dev/utilities_plotting.ipynb

Lines changed: 151 additions & 35 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,7 @@
55
"execution_count": 1,
66
"id": "435ed203-5c3c-4efc-93d1-abac66ce7187",
77
"metadata": {},
8-
"outputs": [
9-
{
10-
"name": "stderr",
11-
"output_type": "stream",
12-
"text": [
13-
"WARNING (pytensor.tensor.blas): Using NumPy C-API based implementation for BLAS functions.\n"
14-
]
15-
}
16-
],
8+
"outputs": [],
179
"source": [
1810
"from pymc_marketing.clv import utils\n",
1911
"\n",
@@ -30,7 +22,7 @@
3022
},
3123
{
3224
"cell_type": "code",
33-
"execution_count": 69,
25+
"execution_count": 2,
3426
"id": "7de7f396-1d5b-4457-916b-c29ed90aa132",
3527
"metadata": {},
3628
"outputs": [],
@@ -66,7 +58,7 @@
6658
},
6759
{
6860
"cell_type": "code",
69-
"execution_count": 70,
61+
"execution_count": 3,
7062
"id": "932e8db6-78cf-49df-aa4a-83ee6584e5dd",
7163
"metadata": {},
7264
"outputs": [
@@ -196,7 +188,7 @@
196188
"13 6 2015-02-02 True"
197189
]
198190
},
199-
"execution_count": 70,
191+
"execution_count": 3,
200192
"metadata": {},
201193
"output_type": "execute_result"
202194
}
@@ -223,7 +215,7 @@
223215
},
224216
{
225217
"cell_type": "code",
226-
"execution_count": 74,
218+
"execution_count": 4,
227219
"id": "4c0a7de5-8825-40af-84e5-6cd0ad26a0e3",
228220
"metadata": {},
229221
"outputs": [
@@ -259,57 +251,57 @@
259251
" <tr>\n",
260252
" <th>0</th>\n",
261253
" <td>1</td>\n",
262-
" <td>1.0</td>\n",
254+
" <td>2.0</td>\n",
263255
" <td>5.0</td>\n",
264256
" <td>5.0</td>\n",
265-
" <td>2.0</td>\n",
257+
" <td>1.5</td>\n",
266258
" </tr>\n",
267259
" <tr>\n",
268260
" <th>1</th>\n",
269261
" <td>2</td>\n",
270-
" <td>0.0</td>\n",
262+
" <td>1.0</td>\n",
271263
" <td>0.0</td>\n",
272264
" <td>5.0</td>\n",
273-
" <td>0.0</td>\n",
265+
" <td>2.0</td>\n",
274266
" </tr>\n",
275267
" <tr>\n",
276268
" <th>2</th>\n",
277269
" <td>3</td>\n",
278-
" <td>1.0</td>\n",
270+
" <td>2.0</td>\n",
279271
" <td>1.0</td>\n",
280272
" <td>5.0</td>\n",
281-
" <td>5.0</td>\n",
273+
" <td>4.5</td>\n",
282274
" </tr>\n",
283275
" <tr>\n",
284276
" <th>3</th>\n",
285277
" <td>4</td>\n",
286-
" <td>1.0</td>\n",
278+
" <td>2.0</td>\n",
287279
" <td>3.0</td>\n",
288280
" <td>3.0</td>\n",
289-
" <td>8.0</td>\n",
281+
" <td>7.0</td>\n",
290282
" </tr>\n",
291283
" <tr>\n",
292284
" <th>4</th>\n",
293285
" <td>5</td>\n",
294-
" <td>0.0</td>\n",
286+
" <td>1.0</td>\n",
295287
" <td>0.0</td>\n",
296288
" <td>3.0</td>\n",
297-
" <td>0.0</td>\n",
289+
" <td>12.0</td>\n",
298290
" </tr>\n",
299291
" </tbody>\n",
300292
"</table>\n",
301293
"</div>"
302294
],
303295
"text/plain": [
304296
" customer_id frequency recency T monetary_value\n",
305-
"0 1 1.0 5.0 5.0 2.0\n",
306-
"1 2 0.0 0.0 5.0 0.0\n",
307-
"2 3 1.0 1.0 5.0 5.0\n",
308-
"3 4 1.0 3.0 3.0 8.0\n",
309-
"4 5 0.0 0.0 3.0 0.0"
297+
"0 1 2.0 5.0 5.0 1.5\n",
298+
"1 2 1.0 0.0 5.0 2.0\n",
299+
"2 3 2.0 1.0 5.0 4.5\n",
300+
"3 4 2.0 3.0 3.0 7.0\n",
301+
"4 5 1.0 0.0 3.0 12.0"
310302
]
311303
},
312-
"execution_count": 74,
304+
"execution_count": 4,
313305
"metadata": {},
314306
"output_type": "execute_result"
315307
}
@@ -323,7 +315,7 @@
323315
" observation_period_end = \"2015-02-06\",\n",
324316
" datetime_format = \"%Y-%m-%d\",\n",
325317
" time_unit = \"W\",\n",
326-
" include_first_transaction=False,\n",
318+
" include_first_transaction=True,\n",
327319
")\n",
328320
"\n",
329321
"rfm_df.head()"
@@ -339,7 +331,7 @@
339331
},
340332
{
341333
"cell_type": "code",
342-
"execution_count": 76,
334+
"execution_count": 5,
343335
"id": "761edfe9-1b69-4966-83bf-4f1242eda2d5",
344336
"metadata": {},
345337
"outputs": [
@@ -450,7 +442,7 @@
450442
"4 0.0 5.0 "
451443
]
452444
},
453-
"execution_count": 76,
445+
"execution_count": 5,
454446
"metadata": {},
455447
"output_type": "execute_result"
456448
}
@@ -467,13 +459,137 @@
467459
"train_test.head()"
468460
]
469461
},
462+
{
463+
"cell_type": "markdown",
464+
"id": "73dc1b93-6a4f-4171-b838-30759b2c1e0e",
465+
"metadata": {},
466+
"source": [
467+
"`rfm_segments` will assign customer to segments based on their recency, frequency, and monetary value. It uses a quartile-based RFM score approach that is very computationally efficient, but defining custom segments is a rather subjective exercise. The returned dataframe also cannot be used for modeling because it does not zero out the initial transactions."
468+
]
469+
},
470470
{
471471
"cell_type": "code",
472-
"execution_count": null,
472+
"execution_count": 40,
473473
"id": "c7b3f800-8dfb-4e5a-b939-5f908281563c",
474474
"metadata": {},
475475
"outputs": [],
476-
"source": []
476+
"source": [
477+
"segments = utils.rfm_segments(\n",
478+
" test_data, \n",
479+
" customer_id_col = \"id\", \n",
480+
" datetime_col = \"date\", \n",
481+
" monetary_value_col = \"monetary_value\",\n",
482+
" observation_period_end = \"2015-02-06\",\n",
483+
" datetime_format = \"%Y-%m-%d\",\n",
484+
" time_unit = \"W\",\n",
485+
")"
486+
]
487+
},
488+
{
489+
"cell_type": "code",
490+
"execution_count": 17,
491+
"id": "932ac4e5-361e-42fa-97d3-d8e508128944",
492+
"metadata": {},
493+
"outputs": [
494+
{
495+
"data": {
496+
"text/html": [
497+
"<div>\n",
498+
"<style scoped>\n",
499+
" .dataframe tbody tr th:only-of-type {\n",
500+
" vertical-align: middle;\n",
501+
" }\n",
502+
"\n",
503+
" .dataframe tbody tr th {\n",
504+
" vertical-align: top;\n",
505+
" }\n",
506+
"\n",
507+
" .dataframe thead th {\n",
508+
" text-align: right;\n",
509+
" }\n",
510+
"</style>\n",
511+
"<table border=\"1\" class=\"dataframe\">\n",
512+
" <thead>\n",
513+
" <tr style=\"text-align: right;\">\n",
514+
" <th></th>\n",
515+
" <th>customer_id</th>\n",
516+
" <th>frequency</th>\n",
517+
" <th>recency</th>\n",
518+
" <th>monetary_value</th>\n",
519+
" <th>segment</th>\n",
520+
" </tr>\n",
521+
" </thead>\n",
522+
" <tbody>\n",
523+
" <tr>\n",
524+
" <th>0</th>\n",
525+
" <td>1</td>\n",
526+
" <td>2.0</td>\n",
527+
" <td>0.0</td>\n",
528+
" <td>1.5</td>\n",
529+
" <td>Other</td>\n",
530+
" </tr>\n",
531+
" <tr>\n",
532+
" <th>1</th>\n",
533+
" <td>2</td>\n",
534+
" <td>1.0</td>\n",
535+
" <td>5.0</td>\n",
536+
" <td>2.0</td>\n",
537+
" <td>Inactive Customer</td>\n",
538+
" </tr>\n",
539+
" <tr>\n",
540+
" <th>2</th>\n",
541+
" <td>3</td>\n",
542+
" <td>2.0</td>\n",
543+
" <td>4.0</td>\n",
544+
" <td>4.5</td>\n",
545+
" <td>At Risk Customer</td>\n",
546+
" </tr>\n",
547+
" <tr>\n",
548+
" <th>3</th>\n",
549+
" <td>4</td>\n",
550+
" <td>2.0</td>\n",
551+
" <td>0.0</td>\n",
552+
" <td>7.0</td>\n",
553+
" <td>Top Spender</td>\n",
554+
" </tr>\n",
555+
" <tr>\n",
556+
" <th>4</th>\n",
557+
" <td>5</td>\n",
558+
" <td>1.0</td>\n",
559+
" <td>3.0</td>\n",
560+
" <td>12.0</td>\n",
561+
" <td>At Risk Customer</td>\n",
562+
" </tr>\n",
563+
" <tr>\n",
564+
" <th>5</th>\n",
565+
" <td>6</td>\n",
566+
" <td>1.0</td>\n",
567+
" <td>0.0</td>\n",
568+
" <td>5.0</td>\n",
569+
" <td>Top Spender</td>\n",
570+
" </tr>\n",
571+
" </tbody>\n",
572+
"</table>\n",
573+
"</div>"
574+
],
575+
"text/plain": [
576+
" customer_id frequency recency monetary_value segment\n",
577+
"0 1 2.0 0.0 1.5 Other\n",
578+
"1 2 1.0 5.0 2.0 Inactive Customer\n",
579+
"2 3 2.0 4.0 4.5 At Risk Customer\n",
580+
"3 4 2.0 0.0 7.0 Top Spender\n",
581+
"4 5 1.0 3.0 12.0 At Risk Customer\n",
582+
"5 6 1.0 0.0 5.0 Top Spender"
583+
]
584+
},
585+
"execution_count": 17,
586+
"metadata": {},
587+
"output_type": "execute_result"
588+
}
589+
],
590+
"source": [
591+
"segments"
592+
]
477593
}
478594
],
479595
"metadata": {
@@ -492,7 +608,7 @@
492608
"name": "python",
493609
"nbconvert_exporter": "python",
494610
"pygments_lexer": "ipython3",
495-
"version": "3.9.18"
611+
"version": "3.10.14"
496612
}
497613
},
498614
"nbformat": 4,

pymc_marketing/clv/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
)
2626
from pymc_marketing.clv.utils import (
2727
customer_lifetime_value,
28+
rfm_segments,
2829
rfm_summary,
2930
rfm_train_test_split,
3031
)
@@ -39,6 +40,7 @@
3940
"plot_customer_exposure",
4041
"plot_frequency_recency_matrix",
4142
"plot_probability_alive_matrix",
43+
"rfm_segments",
4244
"rfm_summary",
4345
"rfm_train_test_split",
4446
)

0 commit comments

Comments
 (0)