"""
Script to calculate statistics.
"""
from typing import Optional, Union, List, Any
import pandas as pd
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikit_posthocs as sp
import pingouin as pg
from statannotations.Annotator import (
Annotator,
) # replace with statannotations in future
from py50 import utils
import warnings
__all__ = ["Stats", "Plots"]
sns.set_style("ticks")
[docs]
class Stats:
"""
Class contains wrappers for pingouin module. The functions output data as a Pandas DataFrame. This is in a format
needed for plotting with functions in class Plots(), however they can also be used individually to output single
DataFrame for output as a csv or xlsx file using pandas.
"""
[docs]
def __init__(self, data):
if not isinstance(data, pd.DataFrame):
raise ValueError("Input must be a DataFrame")
self.data = data
[docs]
def show(self, rows=None):
"""
show DataFrame
:param rows: Int
Indicate the number of rows to display. If none, automatically show 5.
:return: DataFrame
"""
returned_df = self.data
if rows is None:
# print("rows is none") # for troubleshooting
return returned_df.head()
elif isinstance(rows, int):
# print("rows are given!") # for troubleshooting
return returned_df.head(rows)
[docs]
def get_normality(self, value_col=None, group_col=None, method="shapiro", **kwargs):
    """
    Run a normality test on the stored DataFrame via pingouin.normality().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String
        Name of column containing the grouping variable.
    :param method: String
        Normality test. 'shapiro' (default). Additional tests can be found with
        [pingouin.normality()](https://pingouin-stats.org/build/html/generated/pingouin.normality.html)
    :param kwargs: optional
        Other options forwarded unchanged to pingouin.normality()
    :return: Pandas.DataFrame
    """
    return pg.normality(
        data=self.data,
        dv=value_col,
        group=group_col,
        method=method,
        **kwargs,
    )
[docs]
def get_homoscedasticity(self, value_col=None, group_col=None, method="levene", **kwargs):
    """
    Test equality of variances on the stored DataFrame via pingouin.homoscedasticity().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String
        Name of column containing the grouping variable.
    :param method: String
        Statistical test. 'levene' (default). Additional tests can be found with
        [pingouin.homoscedasticity()](https://pingouin-stats.org/build/html/generated/pingouin.homoscedasticity.html#pingouin.homoscedasticity)
    :param kwargs: optional
        Other options forwarded unchanged to pingouin.homoscedasticity()
    :return: Pandas.DataFrame
    """
    return pg.homoscedasticity(
        data=self.data,
        dv=value_col,
        group=group_col,
        method=method,
        **kwargs,
    )
"""
Parametric tests below
"""
[docs]
def get_anova(self, value_col=None, group_col=None, **kwargs):
    """
    One-way and N-way ANOVA, computed with pingouin.anova().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String or list of strings
        Name of column(s) containing the grouping variable; passed to pingouin as ``between``.
    :param kwargs: optional
        Other options available with
        [pingouin.anova()](https://pingouin-stats.org/build/html/generated/pingouin.anova.html)
    :return: Pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    anova_table = pg.anova(data=self.data, dv=value_col, between=group_col, **kwargs)
    # Label every uncorrected p-value with the marker returned by utils.star_value.
    anova_table["significance"] = [utils.star_value(p) for p in anova_table.p_unc]
    return anova_table
[docs]
def get_welch_anova(self, value_col=None, group_col=None):
    """
    One-way Welch ANOVA, computed with pingouin.welch_anova().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String
        Name of column containing the grouping variable; passed to pingouin as ``between``.
    :return: Pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    welch_table = pg.welch_anova(data=self.data, dv=value_col, between=group_col)
    # One significance marker per row, derived from the uncorrected p-value.
    stars = []
    for p in welch_table.p_unc:
        stars.append(utils.star_value(p))
    welch_table["significance"] = stars
    return welch_table
[docs]
def get_rm_anova(self, value_col=None, within_subject_col=None, subject_col=None, correction="auto", detailed=False,
                 effsize="ng2"):
    """
    One-way and two-way repeated measures ANOVA, computed with pingouin.rm_anova().
    :param value_col: String
        Name of column containing the dependent variable.
    :param within_subject_col: String
        Name of column containing the within factor.
    :param subject_col: String
        Name of column containing the subject identifier.
    :param correction: String or Boolean
        If True, also return the Greenhouse-Geisser corrected p-value.
    :param detailed: Boolean
        If True, return full ANOVA table.
    :param effsize: String
        Effect size.
    :return: Pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    rm_table = pg.rm_anova(
        data=self.data,
        dv=value_col,
        within=within_subject_col,
        subject=subject_col,
        correction=correction,
        detailed=detailed,
        effsize=effsize,
    )
    # Attach the marker produced by utils.star_value for each uncorrected p-value.
    rm_table["significance"] = [utils.star_value(p) for p in rm_table.p_unc]
    return rm_table
[docs]
def get_mixed_anova(self, value_col=None, group_col=None, within_subject_col=None, subject_col=None, **kwargs):
    """
    Mixed-design ANOVA, computed with pingouin.mixed_anova().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String
        Name of column containing the between factor.
    :param within_subject_col: String
        Name of column containing the within-subject factor (repeated measurements).
    :param subject_col: String
        Name of column containing the between-subject identifier.
    :param kwargs: optional
        Other options available with
        [pingouin.mixed_anova()](https://pingouin-stats.org/build/html/generated/pingouin.mixed_anova.html)
    :return: Pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    mixed_table = pg.mixed_anova(
        data=self.data,
        dv=value_col,
        between=group_col,
        within=within_subject_col,
        subject=subject_col,
        **kwargs,
    )
    # Attach the marker produced by utils.star_value for each uncorrected p-value.
    mixed_table["significance"] = [utils.star_value(p) for p in mixed_table.p_unc]
    return mixed_table
[docs]
def get_tukey(self, value_col=None, group_col=None, effsize="hedges"):
    """
    Pairwise Tukey post-hoc test, computed with pingouin.pairwise_tukey().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String
        Name of column containing the between factor.
    :param effsize: String or None
        Effect size. Additional methods can be found with
        [pingouin.pairwise_tukey()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_tukey.html)
    :return: Pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    tukey_table = pg.pairwise_tukey(data=self.data, dv=value_col, between=group_col, effsize=effsize)
    # Markers are derived from the Tukey-adjusted p-values.
    stars = []
    for p in tukey_table.p_tukey:
        stars.append(utils.star_value(p))
    tukey_table["significance"] = stars
    return tukey_table
[docs]
def get_gameshowell(self, value_col=None, group_col=None, effsize="hedges"):
    """
    Pairwise Games-Howell post-hoc test, computed with pingouin.pairwise_gameshowell().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String
        Name of column containing the between factor.
    :param effsize: String or None
        Effect size. Additional methods can be found with
        [pingouin.pairwise_gameshowell()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_gameshowell.html)
    :return: Pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    gh_table = pg.pairwise_gameshowell(
        data=self.data,
        dv=value_col,
        between=group_col,
        effsize=effsize,
    )
    # Markers are derived from the "pval" column of the Games-Howell result.
    gh_table["significance"] = [utils.star_value(p) for p in gh_table.pval]
    return gh_table
"""
non-parametric tests below
"""
[docs]
def get_wilcoxon(self, value_col=None, group_col=None, subgroup_col=None, alternative="two-sided", **kwargs):
"""
Calculate wilcoxon tests. This is non-parametric version of paired T-test. Data number must be uniform to work.
:param value_col: String
Columns containing values for testing.
:param group_col: String
Column containing group name.
:param subgroup_col: String
Column containing subgroup name.
:param alternative: String
Defines the alternative hypothesis, or tail of the test. Must be one of “two-sided”. Must be one of
“two-sided” (default), “greater” or “less”.
:param kwargs: Optional
Other options available with
[pingouin.wilcoxon()](https://pingouin-stats.org/build/html/generated/pingouin.wilcoxon.html)
:return: Pandas.DataFrame
"""
# ignore Wilcoxon warnings
warnings.filterwarnings("ignore", message="Exact p-value calculation does not work if there are zeros.*")
if subgroup_col:
# Convert 'Name' and 'Status' columns to string
self.data[group_col] = self.data[group_col].astype(str)
self.data[subgroup_col] = self.data[subgroup_col].astype(str)
self.data["subgroup"] = self.data[group_col] + "-" + self.data[subgroup_col]
subgroup_list = self.data["subgroup"].unique().tolist()
subgroup_df = self.data[self.data["subgroup"].isin(subgroup_list)].copy()
# Get unique pairs between group and subgroup
group = subgroup_df["subgroup"].unique()
# From unique items in group list, generate pairs
pairs = list(combinations(group, 2))
results_list = []
for pair in pairs:
# Get items from pair list and split by hyphen
group1, subgroup1 = pair[0].split("-", 1)
group2, subgroup2 = pair[1].split("-", 1)
# # For troubleshooting
# print("first:", data[(data[group_col] == group1)][value_col].shape)
# print("second:", data[(data[group_col] == group2)][value_col].shape)
# Check length of groups
group1_length = self.data[self.data[group_col] == group1][value_col]
group2_length = self.data[self.data[group_col] == group2][value_col]
# print(len(group1_length), len(group2_length)) # For troubleshooting
if len(group1_length) != len(group2_length):
raise ValueError(
"The lengths of the groups in group_col are not equal!"
)
# Perform Wilcoxon signed-rank test
result = pg.wilcoxon(
self.data[(self.data[group_col] == group1) & (self.data[subgroup_col] == subgroup1)][value_col],
self.data[(self.data[group_col] == group2) & (self.data[subgroup_col] == subgroup2)][value_col],
alternative=alternative, **kwargs)
# Convert significance by pvalue
pvalue_output = [utils.star_value(value) for value in result.p_val]
# Store the results in the list
results_list.append(
{
"A": f"{group1}-{subgroup1}",
"B": f"{group2}-{subgroup2}",
"W-val": result.W_val.iloc[0],
"p-val": result.p_val.iloc[0],
"significance": pvalue_output[0],
"RBC": result.RBC.iloc[0],
"CLES": result.CLES.iloc[0],
}
)
# Convert the list of dictionaries to a DataFrame
result_df = pd.DataFrame(results_list)
# Split values into and separate by comma
result_df["A"] = result_df.A.apply(lambda x: tuple(x.split("-", 1)))
result_df["B"] = result_df.B.apply(lambda x: tuple(x.split("-", 1)))
return result_df
else:
"""
No subgroups found. Tests single group and values.
"""
# Get unique pairs from group
group = self.data[group_col].unique()
# From unique items in group list, generate pairs
pairs = list(combinations(group, 2))
results_list = []
for pair in pairs:
# Get items from pair list and split by hyphen
group1 = pair[0]
group2 = pair[1]
# # For troubleshooting
# print("first:", data[(data[group_col] == group1)][value_col].shape)
# print("second:", data[(data[group_col] == group2)][value_col].shape)
# Check length of groups
group1_length = self.data[self.data[group_col] == group1][value_col]
group2_length = self.data[self.data[group_col] == group2][value_col]
# print(len(group1_length), len(group2_length)) # For troubleshooting
if len(group1_length) != len(group2_length):
raise ValueError(
"The lengths of the groups in group_col are not equal!"
)
# Perform wilcoxon
result = pg.wilcoxon(
self.data[(self.data[group_col] == group1)][value_col],
self.data[(self.data[group_col] == group2)][value_col],
alternative=alternative,
**kwargs,
)
pvalue_output = [utils.star_value(value) for value in result.p_val]
results_list.append(
{
"A": group1,
"B": group2,
"W-val": result.W_val.iloc[0],
"p-val": result.p_val.iloc[0],
"significance": pvalue_output[0],
"RBC": result.RBC.iloc[0],
"CLES": result.CLES.iloc[0],
}
)
# Convert the list of dictionaries to a DataFrame
result_df = pd.DataFrame(results_list)
# Add significance asterisk
pvalue_output = [utils.star_value(value) for value in result_df["p-val"]]
result_df["significance"] = pvalue_output
return result_df
[docs]
def get_mannu(self, value_col=None, group_col=None, subgroup_col=None, alternative="two-sided", **kwargs):
"""
Calculate Mann-Whitney U Test. This is a non-parametric version of the independent T-test.
:param self: pandas.DataFrame
Input DataFrame.
:param value_col: String
Columns containing values for testing.
:param group_col: String
Column containing group name.
:param subgroup_col: String
Column containing subgroup name.
:param alternative: String
Defines the alternative hypothesis, or tail of the test. Must be one of “two-sided”. Must be one of
“two-sided” (default), “greater” or “less”.
:param kwargs: Optional
Other options available with [pingouin.mwu()](https://pingouin-stats.org/build/html/generated/pingouin.mwu.html)
:return: Pandas.DataFrame
"""
if subgroup_col:
# Convert 'Name' and 'Status' columns to string
self.data[group_col] = self.data[group_col].astype(str)
self.data[subgroup_col] = self.data[subgroup_col].astype(str)
self.data["subgroup"] = self.data[group_col] + "-" + self.data[subgroup_col]
subgroup_list = self.data["subgroup"].unique().tolist()
subgroup_df = self.data[self.data["subgroup"].isin(subgroup_list)].copy()
# Get unique pairs between group and subgroup
group = subgroup_df["subgroup"].unique()
# From unique items in group list, generate pairs
pairs = list(combinations(group, 2))
# Check to ensure right columns selected
if self.data[group_col].dtype != "object":
raise ValueError(f"Is group_col: '{group_col}' strings?")
elif self.data[subgroup_col].dtype != "object":
raise ValueError(f"Is subgroup_col: '{subgroup_col}' strings?")
elif self.data[value_col].dtype == "object":
raise ValueError(f"Is value_col: '{value_col}' should be numerical?")
results_list = []
for pair in pairs:
# print('this is the pair:', pair) # for troubleshooting
# print('this is the pairs:', pairs)
# Get items from pair list and split by hyphen
group1, subgroup1 = pair[0].split("-", 1)
group2, subgroup2 = pair[1].split("-", 1)
# Perform mwu
result = pg.mwu(
self.data[(self.data[group_col] == group1) & (self.data[subgroup_col] == subgroup1)][value_col],
self.data[(self.data[group_col] == group2) & (self.data[subgroup_col] == subgroup2)][value_col],
alternative=alternative, **kwargs)
# Convert significance by pvalue
pvalue_output = [utils.star_value(value) for value in result.p_val]
# Store the results in the list
results_list.append(
{
"A": f"{group1}-{subgroup1}",
"B": f"{group2}-{subgroup2}",
"U-val": result.U_val.iloc[0],
"p-val": result.p_val.iloc[0],
"significance": pvalue_output[0],
"RBC": result.RBC.iloc[0],
"CLES": result.CLES.iloc[0],
}
)
# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(results_list)
# Split values into and separate by comma
df["A"] = df["A"].apply(lambda x: tuple(x.split("-", 1)))
df["B"] = df["B"].apply(lambda x: tuple(x.split("-", 1)))
return df
else:
"""
No subgroups found. Tests single group and values.
"""
# Get unique pairs from group
group = self.data[group_col].unique()
# From unique items in group list, generate pairs
pairs = list(combinations(group, 2))
results_list = []
for pair in pairs:
# Get items from pair list and split by hyphen
group1 = pair[0]
group2 = pair[1]
# Perform mwu
result = pg.mwu(
self.data[(self.data[group_col] == group1)][value_col],
self.data[(self.data[group_col] == group2)][value_col],
alternative=alternative,
**kwargs,
)
pvalue_output = [utils.star_value(value) for value in result.p_val]
results_list.append(
{
"A": group1,
"B": group2,
"U-val": result.U_val.iloc[0],
"p-val": result.p_val.iloc[0],
"significance": pvalue_output[0],
"RBC": result.RBC.iloc[0],
"CLES": result.CLES.iloc[0],
}
)
# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(results_list)
return df
[docs]
def get_kruskal(self, value_col=None, group_col=None, detailed=False):
    """
    Kruskal-Wallis H-test for independent samples, computed with pingouin.kruskal().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String
        Name of column containing the between factor.
    :param detailed: Boolean
        Output additional details from Kruskal-Wallis H-test.
    :return: Pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    kruskal_table = pg.kruskal(data=self.data, dv=value_col, between=group_col, detailed=detailed)
    # Attach the marker produced by utils.star_value for each uncorrected p-value.
    stars = []
    for p in kruskal_table.p_unc:
        stars.append(utils.star_value(p))
    kruskal_table["significance"] = stars
    return kruskal_table
[docs]
def get_cochran(self, value_col=None, group_col=None, subgroup_col=None):
    """
    Calculate Cochran Q Test. This is used when the dependent variable, or value_col, is binary. For details between
    groups, posthoc test will be needed.
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String
        When subgroup_col is given, this column is passed to pingouin.cochran() as ``subject``; otherwise it is
        passed as ``within``.
    :param subgroup_col: String
        Optional. When given, this column is passed to pingouin.cochran() as ``within``.
    :return: Pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    # NOTE(review): get_friedman maps group_col -> within and subgroup_col -> subject, the reverse of the
    # mapping used here when subgroup_col is given. Confirm which mapping is intended.
    cochran_args = {"data": self.data, "dv": value_col}
    if subgroup_col:
        cochran_args["within"] = subgroup_col
        cochran_args["subject"] = group_col
    else:
        cochran_args["within"] = group_col
    cochran_table = pg.cochran(**cochran_args)
    # Attach the marker produced by utils.star_value for each uncorrected p-value.
    cochran_table["significance"] = [utils.star_value(p) for p in cochran_table.p_unc]
    return cochran_table
[docs]
def get_friedman(self, group_col=None, value_col=None, subgroup_col=None, method="chisq"):
"""
Calculate Friedman Test. Determines if distributions of two or more paired samples are equal. For details between
groups, posthoc test (get_pairwise_tests(parametric=False)) will be needed.
:param value_col: String
Name of column containing the dependent variable
:param group_col: String
Name of column containing the between-subject factor.
:param subgroup_col: String
Name of column containing the subject/rater identifier
:param method: String
Statistical test to perform. Must be 'chisq' (chi-square test) or 'f' (F test). See Pingouin
documentation for further details
:return: Pandas.DataFrame
"""
# Raise error if subgroup_col not given
if subgroup_col is None:
raise ValueError("Friedman test must be in long format and requires a subgroup_col as subject")
result_df = pg.friedman(data=self.data, dv=value_col, within=group_col, subject=subgroup_col, method=method)
# Add significance asterisk
pvalue_output = [utils.star_value(value) for value in result_df.p_unc]
result_df["significance"] = pvalue_output
return result_df
"""
pairwise t-tests below
"""
[docs]
def get_pairwise_tests(self, value_col=None, group_col=None, within_subject_col=None, subject_col=None,
                       parametric=True, **kwargs):
    """
    Posthoc pairwise comparisons (parametric by default), computed with pingouin.pairwise_tests().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String or list with 2 elements
        Name of column containing the between-subject factors.
    :param within_subject_col: String or list with 2 elements
        Name of column containing the within-subject identifier.
    :param subject_col: String
        Name of column containing the subject identifier. This is mandatory if within_subject_col is used.
    :param parametric: Boolean
        If True (default), use the parametric ttest() function. If False, use [pingouin.wilcoxon()](https://pingouin-stats.org/build/html/generated/pingouin.wilcoxon.html#pingouin.wilcoxon) or [pingouin.mwu()](https://pingouin-stats.org/build/html/generated/pingouin.mwu.html#pingouin.mwu)
        for paired or unpaired samples, respectively.
    :param kwargs: dict
        Additional keywords arguments that are passed to [pingouin.pairwise_tests()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_tests.html#pingouin.pairwise_tests).
    :return: pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    pairwise_table = pg.pairwise_tests(
        data=self.data,
        dv=value_col,
        between=group_col,
        within=within_subject_col,
        subject=subject_col,
        parametric=parametric,
        **kwargs,
    )
    # Attach the marker produced by utils.star_value for each uncorrected p-value.
    pairwise_table["significance"] = [utils.star_value(p) for p in pairwise_table.p_unc]
    return pairwise_table
[docs]
def get_pairwise_rm(self, value_col=None, group_col=None, within_subject_col=None, subject_col=None,
                    parametric=True, **kwargs, ):
    """
    Posthoc test for repeated measures, computed with pingouin.pairwise_tests().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String or list with 2 elements
        Name of column containing the between-subject factors.
    :param within_subject_col: String or list with 2 elements
        Name of column containing the within-subject identifier.
    :param subject_col: String
        Name of column containing the subject identifier. This is mandatory if within_subject_col is used.
    :param parametric: Boolean
        If True (default), use the parametric ttest() function. If False, use [pingouin.wilcoxon()](https://pingouin-stats.org/build/html/generated/pingouin.wilcoxon.html#pingouin.wilcoxon) or [pingouin.mwu()](https://pingouin-stats.org/build/html/generated/pingouin.mwu.html#pingouin.mwu)
        for paired or unpaired samples, respectively.
    :param kwargs: dict
        Additional keywords arguments that are passed to [pingouin.pairwise_tests()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_tests.html#pingouin.pairwise_tests).
    :return: pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    rm_posthoc = pg.pairwise_tests(
        data=self.data,
        dv=value_col,
        between=group_col,
        within=within_subject_col,
        subject=subject_col,
        parametric=parametric,
        **kwargs,
    )
    # Build the marker list row by row from the uncorrected p-values.
    stars = []
    for p in rm_posthoc.p_unc:
        stars.append(utils.star_value(p))
    rm_posthoc["significance"] = stars
    return rm_posthoc
[docs]
def get_pairwise_mixed(self, value_col=None, group_col=None, within_subject_col=None, subject_col=None,
                       parametric=True, **kwargs):
    """
    Posthoc test for mixed ANOVA, computed with pingouin.pairwise_tests().
    :param value_col: String
        Name of column containing the dependent variable.
    :param group_col: String or list with 2 elements
        Name of column containing the between-subject factors.
    :param within_subject_col: String or list with 2 elements
        Name of column containing the within-subject identifier.
    :param subject_col: String
        Name of column containing the subject identifier. This is mandatory if within_subject_col is used.
    :param parametric: Boolean
        If True (default), use the parametric ttest() function. If False, use [pingouin.wilcoxon()](https://pingouin-stats.org/build/html/generated/pingouin.wilcoxon.html#pingouin.wilcoxon) or [pingouin.mwu()](https://pingouin-stats.org/build/html/generated/pingouin.mwu.html#pingouin.mwu)
        for paired or unpaired samples, respectively.
    :param kwargs: dict
        Additional keywords arguments that are passed to [pingouin.pairwise_tests()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_tests.html#pingouin.pairwise_tests).
    :return: pandas.DataFrame
        The pingouin result with an extra "significance" column.
    """
    mixed_posthoc = pg.pairwise_tests(
        data=self.data,
        dv=value_col,
        between=group_col,
        within=within_subject_col,
        subject=subject_col,
        parametric=parametric,
        **kwargs,
    )
    # Attach the marker produced by utils.star_value for each uncorrected p-value.
    mixed_posthoc["significance"] = [utils.star_value(p) for p in mixed_posthoc.p_unc]
    return mixed_posthoc
"""
Output P-Values as a matrix in Pandas DataFrame
"""
[docs]
@staticmethod
def get_p_matrix(data, test=None, group_col1=None, group_col2=None, order=None):
    """
    Convert a DataFrame of statistic results into a matrix by delegating to utils.multi_group(). Group columns
    must be indicated. Group 2 is optional and depends on test used (i.e. pairwise vs Mann-Whitney U). Final
    DataFrame output can be used with the Plots.p_matrix() function to generate a heatmap of p-values.
    :param data: pandas.DataFrame
        Input DataFrame. Must be of already computed test results.
    :param test: String
        Name of the test used to calculate statistics.
    :param group_col1: String
        Name of column containing the group
    :param group_col2: String
        Name of column containing the second group. This variable is optional.
    :param order: List or String == "alpha"
        Reorder the groups for the final table. If input is string "alpha", the order of the groups will be
        alphabetized.
    :return: The value returned by utils.multi_group()
    """
    # utils.multi_group takes the group columns before the test name, unlike this signature.
    return utils.multi_group(data, group_col1, group_col2, test, order)
"""
Function to detail significance column meaning
"""
[docs]
@staticmethod
def explain_significance():
"""
Print out DataFrame containing explanations for star values. This is used for reference. See [GraphPad](https://www.graphpad.com/support/faq/what-is-the-meaning-of--or--or--in-reports-of-statistical-significance-from-prism-or-instat/)
:return: pandas.DataFrame
"""
df = pd.DataFrame(
{
"pvalue": [
"p > 0.05",
"p ≤ 0.05",
" p ≤ 0.01",
"p ≤ 0.001",
"p ≤ 0.0001",
],
"p_value": ["No Significance (n.s.)", "*", "**", "***", "****"],
}
)
return df
[docs]
class Plots(Stats):
[docs]
def __init__(self, data):
    """
    Initialise the plotting wrapper by handing data to the parent class initialiser.
    :param data: pandas.DataFrame
        Input data used for both statistics and plotting.
    """
    super().__init__(data=data)
[docs]
@staticmethod
def list_test():
"""
List all tests available for plotting
:return:
"""
print(
"List of tests available for plotting: 'tukey', 'gameshowell', 'pairwise-parametric', 'pairwise-rm', 'pairwise-mixed', 'pairwise-nonparametric', 'wilcoxon', 'mannu', 'kruskal'"
"'kruskal'"
)
[docs]
def boxplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None, subject_col=None,
            within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None, orient="v",
            loc="inside", whis=1.5, return_df=None, **kwargs):
    """
    Draw a seaborn boxplot of the stored DataFrame and annotate it with the results of a statistical test.
    :param test: String
        Name of test for calculations. Names must match the test names from the py50.Stats()
    :param group_col: String
        Name of column containing groups. This should be the between depending on the selected test.
    :param value_col: String
        Name of the column containing the values. This is the dependent variable.
    :param group_order: List.
        Place the groups in a specific order on the plot.
    :param subgroup_col: String
        Name of the column containing the subgroup for the group column. It is used as the seaborn hue; when
        omitted, group_col is used as the hue instead.
    :param subject_col: String
        Name of the column containing the subject column.
    :param within_subject_col: String
        Name of the column containing the within subject column.
    :param pairs: List
        A list containing specific pairings for annotation on the plot.
    :param pvalue_label: List.
        A list containing specific pvalue labels. Its length must match the length of the pairs list.
    :param hide_ns: bool
        Automatically hide groups with no significance from plot.
    :param palette: String or List.
        Color palette used for the plot. Can be given as common color name or in hex code.
    :param orient: String
        Orientation of the plot. Only "v" (vertical) and "h" (horizontal) are supported; case-insensitive.
    :param loc: String
        Set location of annotations. Only "inside" or "outside" are supported.
    :param whis: Int
        Set length of whiskers on plot.
    :param return_df: Boolean
        Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs.
    :param kwargs: optional
        Extra keywords. Those accepted by seaborn.boxplot or by Annotator (as reported by utils.get_kwargs) are
        forwarded to them. "line_offset_to_group" and "line_offset" (only when both are given) are passed to
        Annotator.annotate(); "title" and "titlesize" set the plot title.
    :return: The Annotator, or a (DataFrame, Annotator) tuple when return_df is truthy.
    :raises ValueError: If orient or loc holds an unsupported value.
    :raises Exception: If the number of pairs and p-value labels differ.
    """
    # Route each extra keyword to whichever callable accepts it.
    sns_names = utils.get_kwargs(sns.boxplot)
    annot_names = utils.get_kwargs(Annotator)
    sns_kwargs = {name: val for name, val in kwargs.items() if name in sns_names}
    annot_kwargs = {name: val for name, val in kwargs.items() if name in annot_names}

    # Run the requested test; yields the pairs to annotate, their labels and the full result table.
    pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col,
                                                         within_subject_col, test, value_col)

    # Line offsets are forwarded to annotate() only when both are supplied.
    offset_keys = ("line_offset_to_group", "line_offset")
    annotate_kwargs = {}
    if all(key in kwargs for key in offset_keys):
        annotate_kwargs = {key: kwargs[key] for key in offset_keys}

    # Optionally drop the non-significant pairs before annotating.
    data_is_sig = True
    if hide_ns:
        pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot)

    # Map the orientation onto the x/y columns used by both seaborn and the Annotator.
    orient = orient.lower()
    axes_by_orient = {"v": (group_col, value_col), "h": (value_col, group_col)}
    if orient not in axes_by_orient:
        raise ValueError("Orientation must be 'v' or 'h'!")
    x_input, y_input = axes_by_orient[orient]

    # Without a subgroup the groups themselves drive the hue.
    subgroup_hue = subgroup_col if subgroup_col else group_col

    ax = sns.boxplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue,
                     whis=whis, **sns_kwargs)
    annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False,
                          orient=orient, hue=subgroup_hue, **annot_kwargs)

    # Custom labels, when given, replace the computed ones.
    if pvalue_label:
        pvalue_plot = pvalue_label

    # Location is validated only after the plot has been drawn, as before.
    if loc not in ("inside", "outside"):
        raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!")
    annotator.configure(loc=loc, test=None)

    if len(pairs) != len(pvalue_plot):
        raise Exception("pairs and pvalue_order length does not match!")
    if data_is_sig:
        annotator.set_custom_annotations(pvalue_plot)
        annotator.annotate(**annotate_kwargs)

    # Title handling: the title is set first and re-set with a font size when one is given.
    title = kwargs.pop("title", None)
    titlesize = kwargs.pop("titlesize", None)
    if title:
        plt.title(title)
        if titlesize:
            plt.title(title, fontsize=titlesize)

    plt.tight_layout()

    if return_df:
        return stat_df_result, annotator
    return annotator
[docs]
def barplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None, subject_col=None,
            within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None, orient="v",
            loc="inside", errorbar="sd", capsize=0.1, return_df=None, **kwargs):
    """
    Draw a barplot from the input DataFrame and annotate it with the results of a statistical test.

    :param test: String
        Name of test for calculations. Names must match the test names from the py50.Stats()
    :param group_col: String
        Name of column containing groups. This should be the between depending on the selected test.
    :param value_col: String
        Name of the column containing the values. This is the dependent variable.
    :param group_order: List.
        Place the groups in a specific order on the plot.
    :param subgroup_col: String
        Name of the column containing the subgroup for the group column. This is associated with the hue parameters
        in Seaborn. If not given, group_col is used as the hue.
    :param subject_col: String
        Name of the column containing the subject column.
    :param within_subject_col: String
        Name of the column containing the within subject column.
    :param pairs: List
        A list containing specific pairings for annotation on the plot.
    :param pvalue_label: List.
        A list containing specific pvalue labels. Its length must match the length of the pairs list.
    :param hide_ns: bool
        Automatically hide groups with no significance from plot.
    :param palette: String or List.
        Color palette used for the plot. Can be given as common color name or in hex code.
    :param orient: String
        Orientation of the plot. Only "v" (vertical) and "h" (horizontal) are supported. Case-insensitive.
    :param loc: String
        Set location of annotations. Only "inside" or "outside" are supported.
    :param errorbar: String
        Passed to the errorbar parameter of seaborn.barplot.
    :param capsize: Int
        Passed to the capsize parameter of seaborn.barplot.
    :param return_df: Boolean
        Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs.
    :param kwargs: Optional
        Extra keyword arguments. They are filtered by name and forwarded to seaborn.barplot and to the Annotator,
        and the full dictionary is handed to the test calculation. "title" and "titlesize" set the plot title.
        "line_offset" and "line_offset_to_group" are forwarded to Annotator.annotate(); either may be given alone.
    :return: The Annotator, or a tuple (DataFrame, Annotator) if return_df is truthy.
    :raises ValueError: If orient is not "v"/"h" or loc is not "inside"/"outside".
    :raises Exception: If the pairs and pvalue label lists differ in length.
    """
    # Route the extra keyword arguments to the callable that accepts them
    valid_sns = utils.get_kwargs(sns.barplot)
    valid_annot = utils.get_kwargs(Annotator)
    sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns}
    annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot}

    # Perform Stat calculations and get pairs and pvalue for annotation
    pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col,
                                                         within_subject_col, test, value_col)

    # Forward every line-offset option that was supplied. Requiring both at once meant a single
    # option was silently ignored.
    annotate_kwargs = {key: kwargs[key] for key in ("line_offset_to_group", "line_offset") if key in kwargs}

    # hide ns label
    if hide_ns:
        pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot)
    else:
        data_is_sig = True

    # set orientation for plot and Annotator
    orient = orient.lower()
    if orient == "v":
        x_input, y_input = group_col, value_col
    elif orient == "h":
        x_input, y_input = value_col, group_col
    else:
        raise ValueError("Orientation must be 'v' or 'h'!")

    # Without an explicit subgroup the group column doubles as the hue
    subgroup_hue = subgroup_col if subgroup_col else group_col

    # plot
    ax = sns.barplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue,
                     errorbar=errorbar, capsize=capsize, **sns_kwargs)
    annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False,
                          orient=orient, hue=subgroup_hue, **annot_kwargs)

    # optional input for custom annotations
    if pvalue_label:
        pvalue_plot = pvalue_label

    # Location of annotations
    if loc not in ["inside", "outside"]:
        raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!")
    annotator.configure(loc=loc, test=None)

    # Make sure the pairs and pvalue lists match
    if len(pairs) != len(pvalue_plot):
        raise Exception("pairs and pvalue_order length does not match!")
    # Annotation is skipped when _option_to_hide_ns flagged the data as not significant
    if data_is_sig:
        annotator.set_custom_annotations(pvalue_plot)
        annotator.annotate(**annotate_kwargs)

    # Adjust title and title size from kwargs; titlesize alone has no effect
    title = kwargs.pop("title", None)
    titlesize = kwargs.pop("titlesize", None)
    if title:
        if titlesize:
            plt.title(title, fontsize=titlesize)
        else:
            plt.title(title)
    plt.tight_layout()

    # Return DataFrame AND figure
    if return_df:
        return stat_df_result, annotator
    return annotator
[docs]
def violinplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None,
               subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False,
               palette=None, orient="v", loc="inside", return_df=None, **kwargs):
    """
    Draw a violinplot from the input DataFrame and annotate it with the results of a statistical test.

    :param test: String
        Name of test for calculations. Names must match the test names from the py50.Stats()
    :param group_col: String
        Name of column containing groups. This should be the between depending on the selected test.
    :param value_col: String
        Name of the column containing the values. This is the dependent variable.
    :param group_order: List.
        Place the groups in a specific order on the plot.
    :param subgroup_col: String
        Name of the column containing the subgroup for the group column. This is associated with the hue parameters
        in Seaborn. If not given, group_col is used as the hue.
    :param subject_col: String
        Name of the column containing the subject column.
    :param within_subject_col: String
        Name of the column containing the within subject column.
    :param pairs: List
        A list containing specific pairings for annotation on the plot.
    :param pvalue_label: List.
        A list containing specific pvalue labels. Its length must match the length of the pairs list.
    :param hide_ns: bool
        Automatically hide groups with no significance from plot.
    :param palette: String or List.
        Color palette used for the plot. Can be given as common color name or in hex code.
    :param orient: String
        Orientation of the plot. Only "v" (vertical) and "h" (horizontal) are supported. Case-insensitive.
    :param loc: String
        Set location of annotations. Only "inside" or "outside" are supported.
    :param return_df: Boolean
        Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs.
    :param kwargs: Optional
        Extra keyword arguments. They are filtered by name and forwarded to seaborn.violinplot and to the
        Annotator, and the full dictionary is handed to the test calculation. "title" and "titlesize" set the
        plot title. "line_offset" and "line_offset_to_group" are forwarded to Annotator.annotate(); either may
        be given alone.
    :return: The Annotator, or a tuple (DataFrame, Annotator) if return_df is truthy.
    :raises ValueError: If orient is not "v"/"h" or loc is not "inside"/"outside".
    :raises Exception: If the pairs and pvalue label lists differ in length.
    """
    # separate kwargs for sns and Annotator
    valid_sns = utils.get_kwargs(sns.violinplot)
    valid_annot = utils.get_kwargs(Annotator)
    sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns}
    annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot}

    # Perform Stat calculations and get pairs and pvalue for annotation
    pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col,
                                                         within_subject_col, test, value_col)

    # Forward every line-offset option that was supplied. Requiring both at once meant a single
    # option was silently ignored.
    annotate_kwargs = {key: kwargs[key] for key in ("line_offset_to_group", "line_offset") if key in kwargs}

    # hide ns label
    if hide_ns:
        pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot)
    else:
        data_is_sig = True

    # set orientation for plot and Annotator
    orient = orient.lower()
    if orient == "v":
        x_input, y_input = group_col, value_col
    elif orient == "h":
        x_input, y_input = value_col, group_col
    else:
        raise ValueError("Orientation must be 'v' or 'h'!")

    # Without an explicit subgroup the group column doubles as the hue
    subgroup_hue = subgroup_col if subgroup_col else group_col

    # plot
    ax = sns.violinplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue,
                        **sns_kwargs)
    annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False,
                          orient=orient, hue=subgroup_hue, **annot_kwargs)

    # optional input for custom annotations
    if pvalue_label:
        pvalue_plot = pvalue_label

    # Location of annotations
    if loc not in ["inside", "outside"]:
        raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!")
    annotator.configure(loc=loc, test=None)

    # Make sure the pairs and pvalue lists match
    if len(pairs) != len(pvalue_plot):
        raise Exception("pairs and pvalue_order length does not match!")
    # Annotation is skipped when _option_to_hide_ns flagged the data as not significant
    if data_is_sig:
        annotator.set_custom_annotations(pvalue_plot)
        annotator.annotate(**annotate_kwargs)

    # Adjust title and title size from kwargs; titlesize alone has no effect
    title = kwargs.pop("title", None)
    titlesize = kwargs.pop("titlesize", None)
    if title:
        if titlesize:
            plt.title(title, fontsize=titlesize)
        else:
            plt.title(title)
    plt.tight_layout()

    # Return DataFrame AND figure
    if return_df:
        return stat_df_result, annotator
    return annotator
[docs]
def swarmplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None,
              subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None,
              orient="v", loc="inside", return_df=None, **kwargs):
    """
    Draw a swarm plot from the input DataFrame and annotate it with the results of a statistical test.

    FutureWarning messages are suppressed while this method runs; the caller's warning filters are restored
    when it returns.

    :param test: String
        Name of test for calculations. Names must match the test names from the py50.Stats()
    :param group_col: String
        Name of column containing groups. This should be the between depending on the selected test.
    :param value_col: String
        Name of the column containing the values. This is the dependent variable.
    :param group_order: List.
        Place the groups in a specific order on the plot.
    :param subgroup_col: String
        Name of the column containing the subgroup for the group column. This is associated with the hue parameters
        in Seaborn. If not given, group_col is used as the hue.
    :param subject_col: String
        Name of the column containing the subject column.
    :param within_subject_col: String
        Name of the column containing the within subject column.
    :param pairs: List
        A list containing specific pairings for annotation on the plot.
    :param pvalue_label: List.
        A list containing specific pvalue labels. Its length must match the length of the pairs list.
    :param hide_ns: bool
        Automatically hide groups with no significance from plot.
    :param palette: String or List.
        Color palette used for the plot. Can be given as common color name or in hex code.
    :param orient: String
        Orientation of the plot. Only "v" (vertical) and "h" (horizontal) are supported. Case-insensitive.
    :param loc: String
        Set location of annotations. Only "inside" or "outside" are supported.
    :param return_df: Boolean
        Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs.
    :param kwargs: Optional
        Extra keyword arguments. They are filtered by name and forwarded to seaborn.swarmplot and to the
        Annotator, and the full dictionary is handed to the test calculation. "title" and "titlesize" set the
        plot title. "line_offset" and "line_offset_to_group" are forwarded to Annotator.annotate(); either may
        be given alone.
    :return: The Annotator, or a tuple (DataFrame, Annotator) if return_df is truthy.
    :raises ValueError: If orient is not "v"/"h" or loc is not "inside"/"outside".
    :raises Exception: If the pairs and pvalue label lists differ in length.
    """
    # Remove the palette/hue warning, but only for the duration of this call: a bare
    # warnings.filterwarnings() would keep hiding FutureWarnings for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)

        # separate kwargs for sns and Annotator
        valid_sns = utils.get_kwargs(sns.swarmplot)
        valid_annot = utils.get_kwargs(Annotator)
        sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns}
        annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot}

        # Perform Stat calculations and get pairs and pvalue for annotation
        pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col,
                                                             subject_col, within_subject_col, test, value_col)

        # Forward every line-offset option that was supplied. Requiring both at once meant a single
        # option was silently ignored.
        annotate_kwargs = {key: kwargs[key] for key in ("line_offset_to_group", "line_offset") if key in kwargs}

        # hide ns label
        if hide_ns:
            pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot)
        else:
            data_is_sig = True

        # set orientation for plot and Annotator
        orient = orient.lower()
        if orient == "v":
            x_input, y_input = group_col, value_col
        elif orient == "h":
            x_input, y_input = value_col, group_col
        else:
            raise ValueError("Orientation must be 'v' or 'h'!")

        # Without an explicit subgroup the group column doubles as the hue
        subgroup_hue = subgroup_col if subgroup_col else group_col

        # plot
        ax = sns.swarmplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette,
                           hue=subgroup_hue, **sns_kwargs)
        annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order,
                              verbose=False, orient=orient, hue=subgroup_hue, **annot_kwargs)

        # optional input for custom annotations
        if pvalue_label:
            pvalue_plot = pvalue_label

        # Location of annotations
        if loc not in ["inside", "outside"]:
            raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!")
        annotator.configure(loc=loc, test=None)

        # Make sure the pairs and pvalue lists match
        if len(pairs) != len(pvalue_plot):
            raise Exception("pairs and pvalue_order length does not match!")
        # Annotation is skipped when _option_to_hide_ns flagged the data as not significant
        if data_is_sig:
            annotator.set_custom_annotations(pvalue_plot)
            annotator.annotate(**annotate_kwargs)

        # Adjust title and title size from kwargs; titlesize alone has no effect
        title = kwargs.pop("title", None)
        titlesize = kwargs.pop("titlesize", None)
        if title:
            if titlesize:
                plt.title(title, fontsize=titlesize)
            else:
                plt.title(title)
        plt.tight_layout()

    # Return DataFrame AND figure
    if return_df:
        return stat_df_result, annotator
    return annotator
[docs]
def stripplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None,
              subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None,
              orient="v", loc="inside", return_df=None, **kwargs):
    """
    Draw a stripplot from the input DataFrame and annotate it with the results of a statistical test.

    FutureWarning messages are suppressed while this method runs; the caller's warning filters are restored
    when it returns.

    :param test: String
        Name of test for calculations. Names must match the test names from the py50.Stats()
    :param group_col: String
        Name of column containing groups. This should be the between depending on the selected test.
    :param value_col: String
        Name of the column containing the values. This is the dependent variable.
    :param group_order: List.
        Place the groups in a specific order on the plot.
    :param subgroup_col: String
        Name of the column containing the subgroup for the group column. This is associated with the hue parameters
        in Seaborn. If not given, group_col is used as the hue.
    :param subject_col: String
        Name of the column containing the subject column.
    :param within_subject_col: String
        Name of the column containing the within subject column.
    :param pairs: List
        A list containing specific pairings for annotation on the plot.
    :param pvalue_label: List.
        A list containing specific pvalue labels. Its length must match the length of the pairs list.
    :param hide_ns: bool
        Automatically hide groups with no significance from plot.
    :param palette: String or List.
        Color palette used for the plot. Can be given as common color name or in hex code.
    :param orient: String
        Orientation of the plot. Only "v" (vertical) and "h" (horizontal) are supported. Case-insensitive.
    :param loc: String
        Set location of annotations. Only "inside" or "outside" are supported.
    :param return_df: Boolean
        Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs.
    :param kwargs: Optional
        Extra keyword arguments. They are filtered by name and forwarded to seaborn.stripplot and to the
        Annotator, and the full dictionary is handed to the test calculation. "title" and "titlesize" set the
        plot title. "line_offset" and "line_offset_to_group" are forwarded to Annotator.annotate(); either may
        be given alone.
    :return: The Annotator, or a tuple (DataFrame, Annotator) if return_df is truthy.
    :raises ValueError: If orient is not "v"/"h" or loc is not "inside"/"outside".
    :raises Exception: If the pairs and pvalue label lists differ in length.
    """
    # Remove the palette/hue warning, but only for the duration of this call: a bare
    # warnings.filterwarnings() would keep hiding FutureWarnings for the rest of the session.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=FutureWarning)

        # separate kwargs for sns and Annotator
        valid_sns = utils.get_kwargs(sns.stripplot)
        valid_annot = utils.get_kwargs(Annotator)
        sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns}
        annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot}

        # Perform Stat calculations and get pairs and pvalue for annotation
        pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col,
                                                             subject_col, within_subject_col, test, value_col)

        # Forward every line-offset option that was supplied. Requiring both at once meant a single
        # option was silently ignored.
        annotate_kwargs = {key: kwargs[key] for key in ("line_offset_to_group", "line_offset") if key in kwargs}

        # hide ns label
        if hide_ns:
            pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot)
        else:
            data_is_sig = True

        # set orientation for plot and Annotator
        orient = orient.lower()
        if orient == "v":
            x_input, y_input = group_col, value_col
        elif orient == "h":
            x_input, y_input = value_col, group_col
        else:
            raise ValueError("Orientation must be 'v' or 'h'!")

        # Without an explicit subgroup the group column doubles as the hue
        subgroup_hue = subgroup_col if subgroup_col else group_col

        # plot
        ax = sns.stripplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette,
                           hue=subgroup_hue, **sns_kwargs)
        annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order,
                              verbose=False, orient=orient, hue=subgroup_hue, **annot_kwargs)

        # optional input for custom annotations
        if pvalue_label:
            pvalue_plot = pvalue_label

        # Location of annotations
        if loc not in ["inside", "outside"]:
            raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!")
        annotator.configure(loc=loc, test=None)

        # Make sure the pairs and pvalue lists match
        if len(pairs) != len(pvalue_plot):
            raise Exception("pairs and pvalue_order length does not match!")
        # Annotation is skipped when _option_to_hide_ns flagged the data as not significant
        if data_is_sig:
            annotator.set_custom_annotations(pvalue_plot)
            annotator.annotate(**annotate_kwargs)

        # Adjust title and title size from kwargs; titlesize alone has no effect
        title = kwargs.pop("title", None)
        titlesize = kwargs.pop("titlesize", None)
        if title:
            if titlesize:
                plt.title(title, fontsize=titlesize)
            else:
                plt.title(title)
        plt.tight_layout()

    # Return DataFrame AND figure
    if return_df:
        return stat_df_result, annotator
    return annotator
[docs]
def boxenplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None,
              subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None,
              orient="v", loc="inside", return_df=None, **kwargs):
    """
    Draw a boxenplot from the input DataFrame and annotate it with the results of a statistical test.

    :param test: String
        Name of test for calculations. Names must match the test names from the py50.Stats()
    :param group_col: String
        Name of column containing groups. This should be the between depending on the selected test.
    :param value_col: String
        Name of the column containing the values. This is the dependent variable.
    :param group_order: List.
        Place the groups in a specific order on the plot.
    :param subgroup_col: String
        Name of the column containing the subgroup for the group column. This is associated with the hue parameters
        in Seaborn. If not given, group_col is used as the hue.
    :param subject_col: String
        Name of the column containing the subject column.
    :param within_subject_col: String
        Name of the column containing the within subject column.
    :param pairs: List
        A list containing specific pairings for annotation on the plot.
    :param pvalue_label: List.
        A list containing specific pvalue labels. Its length must match the length of the pairs list.
    :param hide_ns: bool
        Automatically hide groups with no significance from plot.
    :param palette: String or List.
        Color palette used for the plot. Can be given as common color name or in hex code.
    :param orient: String
        Orientation of the plot. Only "v" (vertical) and "h" (horizontal) are supported. Case-insensitive.
    :param loc: String
        Set location of annotations. Only "inside" or "outside" are supported.
    :param return_df: Boolean
        Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs.
    :param kwargs: Optional
        Extra keyword arguments. They are filtered by name and forwarded to seaborn.boxenplot and to the
        Annotator, and the full dictionary is handed to the test calculation. "title" and "titlesize" set the
        plot title. "line_offset" and "line_offset_to_group" are forwarded to Annotator.annotate(); either may
        be given alone.
    :return: The Annotator, or a tuple (DataFrame, Annotator) if return_df is truthy.
    :raises ValueError: If orient is not "v"/"h" or loc is not "inside"/"outside".
    :raises Exception: If the pairs and pvalue label lists differ in length.
    """
    # separate kwargs for sns and Annotator
    valid_sns = utils.get_kwargs(sns.boxenplot)
    valid_annot = utils.get_kwargs(Annotator)
    sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns}
    annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot}

    # Perform Stat calculations and get pairs and pvalue for annotation
    pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col,
                                                         within_subject_col, test, value_col)

    # Forward every line-offset option that was supplied. Requiring both at once meant a single
    # option was silently ignored.
    annotate_kwargs = {key: kwargs[key] for key in ("line_offset_to_group", "line_offset") if key in kwargs}

    # hide ns label
    if hide_ns:
        pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot)
    else:
        data_is_sig = True

    # set orientation for plot and Annotator
    orient = orient.lower()
    if orient == "v":
        x_input, y_input = group_col, value_col
    elif orient == "h":
        x_input, y_input = value_col, group_col
    else:
        raise ValueError("Orientation must be 'v' or 'h'!")

    # Without an explicit subgroup the group column doubles as the hue
    subgroup_hue = subgroup_col if subgroup_col else group_col

    # plot
    ax = sns.boxenplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue,
                       **sns_kwargs)
    annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False,
                          orient=orient, hue=subgroup_hue, **annot_kwargs)

    # optional input for custom annotations
    if pvalue_label:
        pvalue_plot = pvalue_label

    # Location of annotations
    if loc not in ["inside", "outside"]:
        raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!")
    annotator.configure(loc=loc, test=None)

    # Make sure the pairs and pvalue lists match
    if len(pairs) != len(pvalue_plot):
        raise Exception("pairs and pvalue_order length does not match!")
    # Annotation is skipped when _option_to_hide_ns flagged the data as not significant
    if data_is_sig:
        annotator.set_custom_annotations(pvalue_plot)
        annotator.annotate(**annotate_kwargs)

    # Adjust title and title size from kwargs; titlesize alone has no effect
    title = kwargs.pop("title", None)
    titlesize = kwargs.pop("titlesize", None)
    if title:
        if titlesize:
            plt.title(title, fontsize=titlesize)
        else:
            plt.title(title)
    plt.tight_layout()

    # Return DataFrame AND figure
    if return_df:
        return stat_df_result, annotator
    return annotator
[docs]
def ci_plot(self, data: Optional[pd.DataFrame] = None, value_col: Optional[str] = None,
            group_col: Optional[str] = None, alpha: float = 0.05,
            title: str = "Tukey HSD Confidence Intervals", xlabel: Optional[str] = None,
            ylabel: Optional[str] = None, linewidth: float = 1.5, figsize: tuple = (8, 6),
            return_stats: bool = False):
    """
    Generate a confidence interval plot. The plot utilizes the Tukey Honest Significant Difference (HSD) test and
    is a wrapper for statsmodels (https://www.statsmodels.org/dev/index.html). ANOVA is also calculated on the
    same dataset and its p-value is appended to the title.

    :param data: Optional
        Input dataset. If None, the DataFrame stored on the instance is used.
    :param value_col: str
        Name of the column containing the dependent variable.
    :param group_col: str
        Name of the column containing the groups.
    :param alpha: float
        The significance level for the test.
    :param title: str
        Set the title for the figure. Defaults to "Tukey HSD Confidence Intervals".
    :param xlabel: str
        Set the label for the x-axis. If None is given, defaults to the value_col input.
    :param ylabel: str
        Set the label for the y-axis. If None is given, defaults to the group_col input.
    :param linewidth: float
        Set the width of the lines.
    :param figsize: tuple
        Set the figure size. Defaults to (8,6).
    :param return_stats: bool
        If True, print the Tukey HSD summary table.
    :return: None. The plot is drawn on a newly created matplotlib figure.
    """
    if data is None:
        data = self.data

    # calculate tukey using statsmodels
    tukey = pairwise_tukeyhsd(endog=data[value_col], groups=data[group_col], alpha=alpha)

    # Calculate ANOVA on the same dataset as the Tukey test. Using self.data here would report a
    # p-value for a different dataset whenever the caller passes `data`.
    anova_table = Stats(data=data).get_anova(value_col=value_col, group_col=group_col)
    # NOTE(review): relies on get_anova() exposing a "p_unc" column - confirm against Stats.get_anova
    anova = anova_table.p_unc.iloc[0]

    # The group with the highest mean is the reference for the simultaneous comparison
    best_group = data.groupby(group_col)[value_col].mean().idxmax()

    # set labels
    if xlabel is None:
        xlabel = value_col
    if ylabel is None:
        ylabel = group_col

    # optional printout of calculated stats
    if return_stats:
        print(tukey.summary())

    # plot CI
    fig, ax = plt.subplots(figsize=figsize)
    tukey.plot_simultaneous(comparison_name=best_group, ax=ax)

    # change line thickness
    for collection in ax.collections:
        collection.set_linewidth(linewidth)

    # Scale the center dots with the line width, but never below the base marker size
    original_marker_size = 8
    marker_size = max(original_marker_size, original_marker_size * (linewidth / 2))
    for line in ax.lines:
        if line.get_linestyle() != '--':  # avoid dash vertical
            line.set_markersize(marker_size)

    plt.title(f"{title} | ANOVA p={anova:.3f}", fontsize=18)
    plt.xlabel(xlabel, fontsize=12, labelpad=10)
    plt.ylabel(ylabel, fontsize=12, labelpad=10)
    plt.tight_layout()
# todo add support for lineplot
def _lineplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None,
              subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, palette=None, orient="v",
              loc="inside", ci="sd", capsize=0.1, return_df=None, **kwargs):
    """
    Draw a lineplot from the input DataFrame and annotate it with the results of a statistical test.
    Private: lineplot support is still marked as a todo in this module.

    :param test: String
        Name of test for calculations. Names must match the test names from the py50.Stats()
    :param group_col: String
        Name of column containing groups. This should be the between depending on the selected test.
    :param value_col: String
        Name of the column containing the values. This is the dependent variable.
    :param group_order: List.
        Place the groups in a specific order on the plot.
    :param subgroup_col: String
        Name of the column containing the subgroup for the group column. This is associated with the hue parameters
        in Seaborn. If not given, group_col is used as the hue.
    :param subject_col: String
        Name of the column containing the subject column.
    :param within_subject_col: String
        Name of the column containing the within subject column.
    :param pairs: List
        A list containing specific pairings for annotation on the plot.
    :param pvalue_label: List.
        A list containing specific pvalue labels. Its length must match the length of the pairs list.
    :param palette: String or List.
        Color palette used for the plot. Can be given as common color name or in hex code.
    :param orient: String
        Orientation of the plot. Only "v" (vertical) and "h" (horizontal) are supported. Case-insensitive.
    :param loc: String
        Set location of annotations. Only "inside" or "outside" are supported.
    :param ci: String
        Passed to the ci parameter of seaborn.lineplot.
    :param capsize: Int
        Passed as capsize to seaborn.lineplot.
    :param return_df: Boolean
        Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs.
    :param kwargs: Optional
        Extra keyword arguments. They are filtered by name and forwarded to seaborn.lineplot and to the
        Annotator, and the full dictionary is handed to the test calculation. "title" and "titlesize" set the
        plot title. "line_offset" and "line_offset_to_group" are forwarded to Annotator.annotate(); either may
        be given alone.
    :return: The Annotator, or a tuple (DataFrame, Annotator) if return_df is truthy.
    :raises ValueError: If orient is not "v"/"h" or loc is not "inside"/"outside".
    :raises Exception: If the pairs and pvalue label lists differ in length.
    """
    # separate kwargs for sns and Annotator
    valid_sns = utils.get_kwargs(sns.lineplot)
    valid_annot = utils.get_kwargs(Annotator)
    sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns}
    annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot}

    # Perform Stat calculations and get pairs and pvalue for annotation
    pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col,
                                                         within_subject_col, test, value_col)

    # Forward every line-offset option that was supplied. Requiring both at once meant a single
    # option was silently ignored.
    annotate_kwargs = {key: kwargs[key] for key in ("line_offset_to_group", "line_offset") if key in kwargs}

    # set orientation for plot and Annotator
    orient = orient.lower()
    if orient == "v":
        x_input, y_input = group_col, value_col
    elif orient == "h":
        x_input, y_input = value_col, group_col
    else:
        raise ValueError("Orientation must be 'v' or 'h'!")

    # Without an explicit subgroup the group column doubles as the hue
    subgroup_hue = subgroup_col if subgroup_col else group_col

    # plot
    # ci and capsize for errorbar
    # NOTE(review): `order` and `capsize` do not appear to be named parameters of seaborn.lineplot, and
    # lineplot may not be a plot type the Annotator supports - confirm before making this method public.
    ax = sns.lineplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue,
                      ci=ci, capsize=capsize, **sns_kwargs)
    annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False,
                          orient=orient, hue=subgroup_hue, **annot_kwargs)

    # optional input for custom annotations
    if pvalue_label:
        pvalue_plot = pvalue_label

    # Location of annotations
    if loc not in ["inside", "outside"]:
        raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!")
    annotator.configure(loc=loc, test=None)

    # Make sure the pairs and pvalue lists match
    if len(pairs) != len(pvalue_plot):
        raise Exception("pairs and pvalue_order length does not match!")
    annotator.set_custom_annotations(pvalue_plot)
    annotator.annotate(**annotate_kwargs)

    # Adjust title and title size from kwargs. The previous check (`"title" and "titlesize" in kwargs`)
    # only tested for "titlesize" and passed an unsupported `titlesize=` keyword to plt.title.
    title = kwargs.get("title")
    titlesize = kwargs.get("titlesize")
    if title:
        if titlesize:
            plt.title(title, fontsize=titlesize)
        else:
            plt.title(title)
    plt.tight_layout()

    # Return DataFrame AND figure
    if return_df:
        return stat_df_result, annotator
    return annotator
[docs]
def p_matrix(self, data=None, cmap=None, title=None, titlesize=14, linewidths=0.01, linecolor="gray", **kwargs):
    """
    Wrapper function for the scikit_posthocs significance heatmap (sign_plot).

    :param data: Pandas.Dataframe
        Input table must be a matrix calculated using the stats.get_p_matrix(). Optional; if None, the
        DataFrame stored on the instance is plotted.
    :param cmap: List
        A list of colors. Can be color names or hex codes. If None, a default five-color list is used.
    :param title: String
        Input title for figure.
    :param titlesize: Int
        Set font size of the title. Only used when a title is given.
    :param linewidths: Int
        Set line width of figure.
    :param linecolor: String
        Set line color. Can be color name or hex code.
    :param kwargs: Optional
        Keyword arguments associated with [scikit-posthocs](https://scikit-posthocs.readthedocs.io/en/latest/)
    :return: The value returned by scikit_posthocs.sign_plot()
    """
    matrix = self.data if data is None else data

    # The title is applied before the heatmap is drawn
    if title:
        plt.title(title, fontsize=titlesize)

    # Default cmap list for 1, NS, 0.001, 0.01, 0.05
    colors = ["1", "#fbd7d4", "#005a32", "#238b45", "#a1d99b"] if cmap is None else cmap

    return sp.sign_plot(matrix, cmap=colors, linewidths=linewidths, linecolor=linecolor, **kwargs)
"""
Functions to plot data distribution
"""
[docs]
def distribution(self, val_col=None, type="histplot", **kwargs):
    """
    Plot the distribution of one column of the stored DataFrame.

    :param val_col: String
        The name of the column containing the dependent variable.
    :param type: String
        The type of figure drawn. For distribution, only "histplot" or "qqplot" supported
    :param kwargs: Optional
        Keyword arguments for seaborn.histplot or pg.qqplot. Each plot only receives the keys that
        utils.get_kwargs() reports for it.
    :return: The object returned by seaborn.histplot or pg.qqplot.
    :raises ValueError: If type is neither "histplot" nor "qqplot".
    """

    def _accepted_by(func):
        # Keep only the keyword arguments whose names utils.get_kwargs() lists for func
        allowed = utils.get_kwargs(func)
        return {name: setting for name, setting in kwargs.items() if name in allowed}

    # Incorporate params from sns.histplot and pg.qq
    hist_kwargs = _accepted_by(sns.histplot)
    qq_kwargs = _accepted_by(pg.qqplot)

    if type == "histplot":
        return sns.histplot(data=self.data, x=val_col, **hist_kwargs)
    if type == "qqplot":
        return pg.qqplot(self.data[val_col], dist="norm", **qq_kwargs)
    raise ValueError("For test parameter, only 'histplot' or 'qqplot' available")
def _get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col, within_subject_col, test, value_col):
    """
    Run the requested statistical test and return the pairs and p-value labels for annotating a plot.

    :param self: object exposing the input DataFrame as ``self.data``
    :param group_col: String
        Forwarded to the Stats method as ``group_col``.
    :param kwargs: Dict
        Keyword arguments given by the caller. Only the keys reported by utils.get_kwargs()
        for the matching pingouin function are forwarded to the Stats method.
    :param pairs: List of tuples or None
        Pairs to keep. Passed to _get_pair_subgroup() as ``hue``.
    :param subgroup_col: String
        Forwarded to the "wilcoxon" and "mannu" tests (and, as ``subject_col``, to
        "pairwise-parametric").
    :param subject_col: String
        Forwarded to the "pairwise-rm" and "pairwise-nonparametric" tests (and, as
        ``subgroup_col``, to "pairwise-mixed").
    :param within_subject_col: String
        Forwarded to every "pairwise-*" test.
    :param test: String
        One of "tukey", "gameshowell", "pairwise-rm", "pairwise-mixed", "pairwise-nonparametric",
        "pairwise-parametric", "wilcoxon", "mannu" or "kruskal".
    :param value_col: String
        Forwarded to the Stats method as ``value_col``.
    :return: Tuple
        ``(pairs, pvalue, stat_df)``: the (A, B) tuples of the filtered result table, the labels
        produced by utils.star_value() for its p-value column, and the filtered table itself.
    :raises ValueError: If ``test`` is not one of the supported names.
    """
    base = {"value_col": value_col, "group_col": group_col}
    within = {**base, "within_subject_col": within_subject_col}

    # test name -> (pingouin function used to filter kwargs, Stats method to call,
    #               fixed keyword arguments, column holding the p-values)
    # Names are stored as strings so that only the selected entry is resolved.
    recipes = {
        "tukey": ("pairwise_tukey", "get_tukey", base, "p_tukey"),
        "gameshowell": ("pairwise_gameshowell", "get_gameshowell", base, "pval"),
        "pairwise-rm": (
            "pairwise_tests",
            "get_pairwise_rm",
            {**within, "subject_col": subject_col, "parametric": True},
            "p_unc",
        ),
        # NOTE(review): subject_col is passed as ``subgroup_col`` here -- kept as in the
        # original, confirm against Stats.get_pairwise_mixed().
        "pairwise-mixed": (
            "pairwise_tests",
            "get_pairwise_mixed",
            {**within, "subgroup_col": subject_col, "parametric": True},
            "p_unc",
        ),
        "pairwise-nonparametric": (
            "pairwise_tests",
            "get_pairwise_tests",
            {**within, "subject_col": subject_col, "parametric": False},
            "p_unc",
        ),
        # NOTE(review): subgroup_col is passed as ``subject_col`` here, unlike the
        # non-parametric variant above -- kept as in the original, confirm intent.
        "pairwise-parametric": (
            "pairwise_tests",
            "get_pairwise_tests",
            {**within, "subject_col": subgroup_col, "parametric": True},
            "p_unc",
        ),
        "wilcoxon": ("wilcoxon", "get_wilcoxon", {**base, "subgroup_col": subgroup_col}, "p-val"),
        "mannu": (
            "mwu",
            "get_mannu",
            {**base, "subgroup_col": subgroup_col, "alternative": "two-sided"},
            "p-val",
        ),
        # NOTE(review): assumes Stats.get_kruskal() returns "A"/"B"/"p_unc" columns -- verify.
        "kruskal": ("kruskal", "get_kruskal", base, "p_unc"),
    }

    try:
        pg_name, stats_method, fixed_kwargs, pvalue_col = recipes[test]
    except (KeyError, TypeError):
        # Previously this path only printed the message and then returned stale module
        # globals (or failed with NameError); fail explicitly instead.
        raise ValueError(f"Plotting not supported for {test}!") from None

    # Keep only the caller kwargs listed for the underlying pingouin function
    valid_pg = utils.get_kwargs(getattr(pg, pg_name))
    pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg}

    stat_df = getattr(Stats(self.data), stats_method)(**fixed_kwargs, **pg_kwargs)

    # Keep only the requested pairs (all pairs when ``pairs`` is None), in the requested order
    stat_df = _get_pair_subgroup(stat_df, hue=pairs)
    pvalue = [utils.star_value(value) for value in stat_df[pvalue_col].tolist()]
    pairs = list(zip(stat_df["A"], stat_df["B"]))
    return pairs, pvalue, stat_df
def _get_pair_subgroup(df, hue=None):
    """
    Keep the rows of a pairwise result table whose (A, B) pair is listed in hue, ordered like hue.

    :param df: Pandas.DataFrame
        Result table with the columns "A" and "B". The input is left unmodified.
    :param hue: List of pairs, optional
        Pairs to keep; a row matches in either orientation. If None, every pair found in df is used.
    :return: Pandas.DataFrame
        Filtered copy of df whose index is reset before the rows are re-ordered by _sort_df().
    """
    if hue is None:
        hue = _get_pairs(df)

    # Normalise once: pairs given as lists then behave like tuples both when filtering
    # and when sorting (the sort step compares against tuples).
    ordered_pairs = [tuple(pair) for pair in hue]

    # Both directions are generated for checking df pairs.
    forward_set = set(ordered_pairs)
    reverse_set = {(second, first) for (first, second) in forward_set}

    # Build the pair keys in a standalone Series instead of a temporary "AB" column,
    # so the caller's DataFrame is not mutated.
    row_pairs = pd.Series(list(zip(df["A"], df["B"])), index=df.index, dtype=object)
    keep = row_pairs.isin(forward_set | reverse_set)
    filtered_df = df[keep].copy().reset_index(drop=True)

    # Order the remaining rows like the requested pairs
    return _sort_df(filtered_df, ordered_pairs)
def _get_pairs(df):
    """Return the entries of columns "A" and "B" of df paired row by row as a list of tuples."""
    first_members = df["A"]
    second_members = df["B"]
    return list(zip(first_members, second_members))
# Custom sorting function
def _pair_sort(list_order, row):
    """
    Return the position of a row's pair within list_order.

    The pair is looked up as (A, B) first and as (B, A) second. If neither orientation is
    present, len(list_order) is returned so the row ranks after every listed pair.
    """
    as_given = (row["A"], row["B"])
    flipped = (as_given[1], as_given[0])
    for candidate in (as_given, flipped):
        try:
            return list_order.index(candidate)
        except ValueError:
            # Not listed in this orientation; try the next one
            continue
    return len(list_order)
# Sort the DataFrame based on the custom sorting function
def _sort_df(df, list_order):
    """
    Re-order the rows of df by the rank _pair_sort() assigns to each row's pair.

    :param df: Pandas.DataFrame
        Table with the columns "A" and "B".
    :param list_order: List of tuples
        Pairs in the desired order.
    :return: Pandas.DataFrame
        The rows of df selected positionally in ascending rank order.
    """

    def rank_of(row):
        return _pair_sort(list_order, row)

    row_ranks = df.apply(rank_of, axis=1)
    # argsort gives the row positions that put the ranks in ascending order
    positions = row_ranks.argsort()
    return df.iloc[positions]
# support function for cases where hide_ns is true, and nothing in plot is significant
def _option_to_hide_ns(hide_ns: bool, pairs: Union[list[tuple[Any, Any]], Any], pvalue_plot: list[str]):
    """
    Optionally drop the non-significant ("n.s.") annotations.

    :param hide_ns: Bool
        If False, the inputs are returned unchanged.
    :param pairs: List of tuples
        Pairs matching pvalue_plot position by position.
    :param pvalue_plot: List of strings
        Annotation label for each pair.
    :return: Tuple
        ``(pairs, pvalue_plot, data_is_sig)``. After filtering, pairs and pvalue_plot are tuples
        (they come from zip). If hide_ns is True and every label is "n.s.", a warning is issued,
        the inputs are returned unfiltered and data_is_sig is False.
    """
    if not hide_ns:
        # Nothing to filter; flag stays True to track sigfig for annotations
        return pairs, pvalue_plot, True

    significant = [(label, pair) for label, pair in zip(pvalue_plot, pairs) if label != "n.s."]
    if not significant:
        warnings.warn("No Significant Values found after filtering. Plot drawn without annotations.")
        return pairs, pvalue_plot, False

    # Unzip the surviving (label, pair) couples back into two parallel sequences
    pvalue_plot, pairs = zip(*significant)
    return pairs, pvalue_plot, True
if __name__ == "__main__":
    # Only when the file is executed as a script: run the doctests of this module.
    from doctest import testmod

    testmod()