Home Add frequency count for each category on box plot
Post
Cancel

Add frequency count for each category on box plot

Here I will show how to add frequency count to the each category in the box plot. For this we will use the searborn provided example dataset.

First, lets load the dataset:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
import searborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

data = sns.load_dataset('tips')
tips.head()

total_bill	tip	sex	smoker	day	time	size
0	16.99	1.01	Female	No	Sun	Dinner	2
1	10.34	1.66	Male	No	Sun	Dinner	3
2	21.01	3.50	Male	No	Sun	Dinner	3
3	23.68	3.31	Male	No	Sun	Dinner	2
4	24.59	3.61	Female	No	Sun	Dinner	4

After that, lets create two separate functions to get the all the annotation informations.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
def hue_annotation_counts(df, label, yaxis_val, hue_label, order=None, hue_order=None):
    
    if order==None:
        order = list(df[label].unique())
    if hue_order == None:
        hue_order = list(df[hue_label].unique())
    width=0.8

    n_levels = len(df[hue_label].unique())
    each_width = width / n_levels
    offsets = np.linspace(0, width - each_width, n_levels)
    offsets -= offsets.mean()

    pos = [x+o for x in np.arange(len(order)) for o in offsets]

    counts = df.groupby([label,hue_label])[yaxis_val].size()

    counts = counts.reindex(pd.MultiIndex.from_product([order, hue_order ]))
    
    medians = df.groupby([label,hue_label])[yaxis_val].median()
    medians = medians.reindex(pd.MultiIndex.from_product([order, hue_order]))
    
    return pos, counts, medians, order, hue_order


def label_annotation_counts(df, label, yaxis_val, order=None):
    
    if order==None:
        order = list(df[label].unique())
 
    counts = df.groupby(label)[yaxis_val].size()
    counts = counts.reindex(index=order)

    medians = df.groupby(label)[yaxis_val].median()
    medians = medians.reindex(index=order)

    return range(len(counts)), counts, medians, order

Now, plotting boxplots without hue information:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
plt.figure(figsize=(11,7))

pos, counts, medians, order = label_annotation_counts(temp, 'day','total_bill', order)

sns.boxplot(x = 'day', y='total_bill',data= temp, order=order)

for p,n,m in zip(pos,counts,medians):
    if not np.isnan(m):
        plt.annotate('N={:.0f}'.format(n), xy=(p, m), xycoords='data', ha='center', va='bottom', size=15)
        
plt.tight_layout()
plt.ylabel('Total Bill')
plt.xlabel('Day')
plt.xticks(rotation=90)

box_plot_with_count

Now, plotting boxplots with extra information (‘sex’), which sub-divides x-label into further two categories. If you have your own order and hue order, you can pass that while calling the function.

1
2
3
4
5
6
7
8
9
10
11
12
plt.figure(figsize=(11,8))

pos, counts, medians, order, hue_order = hue_annotation_counts(temp, 'day','total_bill','sex')

sns.boxplot(x = 'day', y='total_bill',hue = 'sex',data= temp, order=order, hue_order=hue_order)

for p,n,m in zip(pos,counts,medians):
    if not np.isnan(m):
        plt.annotate('N={:.0f}'.format(n), xy=(p, m), xycoords='data', ha='center', va='bottom', size=10)
        
plt.tight_layout()
plt.xticks(rotation=90)

box_plot_with_hue_count

This post is licensed under CC BY 4.0 by the author.