Source code for py50.stats

"""
Script to calculate statistics.
"""

from typing import Optional, Union, List, Any
import pandas as pd
from itertools import combinations
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikit_posthocs as sp
import pingouin as pg
from statannotations.Annotator import (
    Annotator,
)  # replace with statannotations in future
from py50 import utils
import warnings

__all__ = ["Stats", "Plots"]

sns.set_style("ticks")


[docs] class Stats: """ Class contains wrappers for pingouin module. The functions output data as a Pandas DataFrame. This is in a format needed for plotting with functions in class Plots(), however they can also be used individually to output single DataFrame for output as a csv or xlsx file using pandas. """
[docs] def __init__(self, data): if not isinstance(data, pd.DataFrame): raise ValueError("Input must be a DataFrame") self.data = data
[docs] def show(self, rows=None): """ show DataFrame :param rows: Int Indicate the number of rows to display. If none, automatically show 5. :return: DataFrame """ returned_df = self.data if rows is None: # print("rows is none") # for troubleshooting return returned_df.head() elif isinstance(rows, int): # print("rows are given!") # for troubleshooting return returned_df.head(rows)
[docs] def get_normality(self, value_col=None, group_col=None, method="shapiro", **kwargs): """ Test data normality of dataset. :param value_col: String Name of column containing the dependent variable. :param group_col: String Name of columnName of column containing the grouping variable. :param method: String Normality test. ‘shapiro’ (default). Additional tests can be found with [pingouin.normality()](https://pingouin-stats.org/build/html/generated/pingouin.normality.html) :param kwargs: optional Other options available with pingouin.normality() :return: Pandas.DataFrame """ result_df = pg.normality( data=self.data, dv=value_col, group=group_col, method=method, **kwargs ) return result_df
[docs] def get_homoscedasticity(self, value_col=None, group_col=None, method="levene", **kwargs): """ Test for data variance. :param value_col: String Name of column containing the dependent variable. :param group_col: String Name of columnName of column containing the grouping variable. :param method: String Statistical test. ‘levene’ (default). Additional tests can be found with [pingouin.homoscedasticity()](https://pingouin-stats.org/build/html/generated/pingouin.homoscedasticity.html#pingouin.homoscedasticity) :param kwargs: optional Other options available with pingouin.homoscedasticity() :return: Pandas.DataFrame """ result_df = pg.homoscedasticity( data=self.data, dv=value_col, group=group_col, method=method, **kwargs ) return result_df
""" Parametric posts below """
[docs] def get_anova(self, value_col=None, group_col=None, **kwargs): """ One-way and N-way ANOVA. :param value_col: String Name of column containing the dependent variable. :param group_col: String or list of strings Name of columnName of column containing the grouping variable. :param kwargs: optional Other options available with [pingouin.anova()](https://pingouin-stats.org/build/html/generated/pingouin.anova.html) :return: Pandas.DataFrame """ result_df = pg.anova(data=self.data, dv=value_col, between=group_col, **kwargs) # Add significance asterisk pvalue_result = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_result return result_df
[docs] def get_welch_anova(self, value_col=None, group_col=None): """ One-way Welch ANOVA :param value_col: String Name of column containing the dependent variable. :param group_col: String Name of column containing the grouping variable. :return: Pandas.DataFrame """ result_df = pg.welch_anova(data=self.data, dv=value_col, between=group_col) # Add significance asterisk pvalue_result = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_result return result_df
[docs] def get_rm_anova(self, value_col=None, within_subject_col=None, subject_col=None, correction="auto", detailed=False, effsize="ng2"): """ One-way and two-way repeated measures ANOVA. :param value_col: String Name of column containing the dependent variable. :param within_subject_col: String Name of column containing the within factor. :param subject_col: String Name of column containing the subject identifier. :param correction: String or Boolean If True, also return the Greenhouse-Geisser corrected p-value. :param detailed: Boolean If True, return full ANOVA table. :param effsize: String Effect size. :return: Pandas.DataFrame """ result_df = pg.rm_anova(data=self.data, dv=value_col, within=within_subject_col, subject=subject_col, correction=correction, detailed=detailed, effsize=effsize) # Add significance asterisk pvalue_result = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_result return result_df
[docs] def get_mixed_anova(self, value_col=None, group_col=None, within_subject_col=None, subject_col=None, **kwargs): """ Mixed-design ANOVA. :param value_col: String Name of column containing the dependent variable. :param group_col: String Name of column containing the between factor. :param within_subject_col: String Name of column containing the within-subject factor (repeated measurements). :param subject_col: Name of column containing the between-subject identifier. :param kwargs: optional Other options available with [pingouin.mixed_anova()](https://pingouin-stats.org/build/html/generated/pingouin.mixed_anova.html) :return: Pandas.DataFrame """ result_df = pg.mixed_anova(data=self.data, dv=value_col, between=group_col, within=within_subject_col, subject=subject_col, **kwargs) # Add significance asterisk pvalue_result = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_result return result_df
[docs] def get_tukey(self, value_col=None, group_col=None, effsize="hedges"): """ Pairwise Tukey post-hoc test. :param value_col: String Name of column containing the dependent variable. :param group_col: String Name of columnName of column containing the between factor. :param effsize: String or None Effect size. Additional methods can be found with [pingouin.pairwise_tukey()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_tukey.html) :return: Pandas.DataFrame """ result_df = pg.pairwise_tukey( data=self.data, dv=value_col, between=group_col, effsize=effsize ) # Add significance asterisk pvalue_result = [utils.star_value(value) for value in result_df.p_tukey] result_df["significance"] = pvalue_result return result_df
[docs] def get_gameshowell(self, value_col=None, group_col=None, effsize="hedges"): """ Pairwise Games-Howell post-hoc test :param value_col: String Name of column containing the dependent variable. :param group_col: String Name of columnName of column containing the between factor. :param effsize: String or None Effect size. Additional methods can be found with [pingouin.pairwise_gameshowell()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_gameshowell.html) :return: Pandas.DataFrame """ result_df = pg.pairwise_gameshowell(data=self.data, dv=value_col, between=group_col, effsize=effsize) # Add significance asterisk pvalue_result = [utils.star_value(value) for value in result_df.pval] result_df["significance"] = pvalue_result return result_df
""" non-parametric tests below """
[docs] def get_wilcoxon(self, value_col=None, group_col=None, subgroup_col=None, alternative="two-sided", **kwargs): """ Calculate wilcoxon tests. This is non-parametric version of paired T-test. Data number must be uniform to work. :param value_col: String Columns containing values for testing. :param group_col: String Column containing group name. :param subgroup_col: String Column containing subgroup name. :param alternative: String Defines the alternative hypothesis, or tail of the test. Must be one of “two-sided”. Must be one of “two-sided” (default), “greater” or “less”. :param kwargs: Optional Other options available with [pingouin.wilcoxon()](https://pingouin-stats.org/build/html/generated/pingouin.wilcoxon.html) :return: Pandas.DataFrame """ # ignore Wilcoxon warnings warnings.filterwarnings("ignore", message="Exact p-value calculation does not work if there are zeros.*") if subgroup_col: # Convert 'Name' and 'Status' columns to string self.data[group_col] = self.data[group_col].astype(str) self.data[subgroup_col] = self.data[subgroup_col].astype(str) self.data["subgroup"] = self.data[group_col] + "-" + self.data[subgroup_col] subgroup_list = self.data["subgroup"].unique().tolist() subgroup_df = self.data[self.data["subgroup"].isin(subgroup_list)].copy() # Get unique pairs between group and subgroup group = subgroup_df["subgroup"].unique() # From unique items in group list, generate pairs pairs = list(combinations(group, 2)) results_list = [] for pair in pairs: # Get items from pair list and split by hyphen group1, subgroup1 = pair[0].split("-", 1) group2, subgroup2 = pair[1].split("-", 1) # # For troubleshooting # print("first:", data[(data[group_col] == group1)][value_col].shape) # print("second:", data[(data[group_col] == group2)][value_col].shape) # Check length of groups group1_length = self.data[self.data[group_col] == group1][value_col] group2_length = self.data[self.data[group_col] == group2][value_col] # print(len(group1_length), len(group2_length)) # For troubleshooting if len(group1_length) != len(group2_length): raise ValueError( "The lengths of the groups in group_col are not equal!" ) # Perform Wilcoxon signed-rank test result = pg.wilcoxon( self.data[(self.data[group_col] == group1) & (self.data[subgroup_col] == subgroup1)][value_col], self.data[(self.data[group_col] == group2) & (self.data[subgroup_col] == subgroup2)][value_col], alternative=alternative, **kwargs) # Convert significance by pvalue pvalue_output = [utils.star_value(value) for value in result.p_val] # Store the results in the list results_list.append( { "A": f"{group1}-{subgroup1}", "B": f"{group2}-{subgroup2}", "W-val": result.W_val.iloc[0], "p-val": result.p_val.iloc[0], "significance": pvalue_output[0], "RBC": result.RBC.iloc[0], "CLES": result.CLES.iloc[0], } ) # Convert the list of dictionaries to a DataFrame result_df = pd.DataFrame(results_list) # Split values into and separate by comma result_df["A"] = result_df.A.apply(lambda x: tuple(x.split("-", 1))) result_df["B"] = result_df.B.apply(lambda x: tuple(x.split("-", 1))) return result_df else: """ No subgroups found. Tests single group and values. """ # Get unique pairs from group group = self.data[group_col].unique() # From unique items in group list, generate pairs pairs = list(combinations(group, 2)) results_list = [] for pair in pairs: # Get items from pair list and split by hyphen group1 = pair[0] group2 = pair[1] # # For troubleshooting # print("first:", data[(data[group_col] == group1)][value_col].shape) # print("second:", data[(data[group_col] == group2)][value_col].shape) # Check length of groups group1_length = self.data[self.data[group_col] == group1][value_col] group2_length = self.data[self.data[group_col] == group2][value_col] # print(len(group1_length), len(group2_length)) # For troubleshooting if len(group1_length) != len(group2_length): raise ValueError( "The lengths of the groups in group_col are not equal!" ) # Perform wilcoxon result = pg.wilcoxon( self.data[(self.data[group_col] == group1)][value_col], self.data[(self.data[group_col] == group2)][value_col], alternative=alternative, **kwargs, ) pvalue_output = [utils.star_value(value) for value in result.p_val] results_list.append( { "A": group1, "B": group2, "W-val": result.W_val.iloc[0], "p-val": result.p_val.iloc[0], "significance": pvalue_output[0], "RBC": result.RBC.iloc[0], "CLES": result.CLES.iloc[0], } ) # Convert the list of dictionaries to a DataFrame result_df = pd.DataFrame(results_list) # Add significance asterisk pvalue_output = [utils.star_value(value) for value in result_df["p-val"]] result_df["significance"] = pvalue_output return result_df
[docs] def get_mannu(self, value_col=None, group_col=None, subgroup_col=None, alternative="two-sided", **kwargs): """ Calculate Mann-Whitney U Test. This is a non-parametric version of the independent T-test. :param self: pandas.DataFrame Input DataFrame. :param value_col: String Columns containing values for testing. :param group_col: String Column containing group name. :param subgroup_col: String Column containing subgroup name. :param alternative: String Defines the alternative hypothesis, or tail of the test. Must be one of “two-sided”. Must be one of “two-sided” (default), “greater” or “less”. :param kwargs: Optional Other options available with [pingouin.mwu()](https://pingouin-stats.org/build/html/generated/pingouin.mwu.html) :return: Pandas.DataFrame """ if subgroup_col: # Convert 'Name' and 'Status' columns to string self.data[group_col] = self.data[group_col].astype(str) self.data[subgroup_col] = self.data[subgroup_col].astype(str) self.data["subgroup"] = self.data[group_col] + "-" + self.data[subgroup_col] subgroup_list = self.data["subgroup"].unique().tolist() subgroup_df = self.data[self.data["subgroup"].isin(subgroup_list)].copy() # Get unique pairs between group and subgroup group = subgroup_df["subgroup"].unique() # From unique items in group list, generate pairs pairs = list(combinations(group, 2)) # Check to ensure right columns selected if self.data[group_col].dtype != "object": raise ValueError(f"Is group_col: '{group_col}' strings?") elif self.data[subgroup_col].dtype != "object": raise ValueError(f"Is subgroup_col: '{subgroup_col}' strings?") elif self.data[value_col].dtype == "object": raise ValueError(f"Is value_col: '{value_col}' should be numerical?") results_list = [] for pair in pairs: # print('this is the pair:', pair) # for troubleshooting # print('this is the pairs:', pairs) # Get items from pair list and split by hyphen group1, subgroup1 = pair[0].split("-", 1) group2, subgroup2 = pair[1].split("-", 1) # Perform mwu result = pg.mwu( self.data[(self.data[group_col] == group1) & (self.data[subgroup_col] == subgroup1)][value_col], self.data[(self.data[group_col] == group2) & (self.data[subgroup_col] == subgroup2)][value_col], alternative=alternative, **kwargs) # Convert significance by pvalue pvalue_output = [utils.star_value(value) for value in result.p_val] # Store the results in the list results_list.append( { "A": f"{group1}-{subgroup1}", "B": f"{group2}-{subgroup2}", "U-val": result.U_val.iloc[0], "p-val": result.p_val.iloc[0], "significance": pvalue_output[0], "RBC": result.RBC.iloc[0], "CLES": result.CLES.iloc[0], } ) # Convert the list of dictionaries to a DataFrame df = pd.DataFrame(results_list) # Split values into and separate by comma df["A"] = df["A"].apply(lambda x: tuple(x.split("-", 1))) df["B"] = df["B"].apply(lambda x: tuple(x.split("-", 1))) return df else: """ No subgroups found. Tests single group and values. """ # Get unique pairs from group group = self.data[group_col].unique() # From unique items in group list, generate pairs pairs = list(combinations(group, 2)) results_list = [] for pair in pairs: # Get items from pair list and split by hyphen group1 = pair[0] group2 = pair[1] # Perform mwu result = pg.mwu( self.data[(self.data[group_col] == group1)][value_col], self.data[(self.data[group_col] == group2)][value_col], alternative=alternative, **kwargs, ) pvalue_output = [utils.star_value(value) for value in result.p_val] results_list.append( { "A": group1, "B": group2, "U-val": result.U_val.iloc[0], "p-val": result.p_val.iloc[0], "significance": pvalue_output[0], "RBC": result.RBC.iloc[0], "CLES": result.CLES.iloc[0], } ) # Convert the list of dictionaries to a DataFrame df = pd.DataFrame(results_list) return df
[docs] def get_kruskal(self, value_col=None, group_col=None, detailed=False): """ Calculate Kruskal-Wallis H-test for independent samples. :param value_col: String Name of column containing the dependent variable. :param group_col: String Name of column containing the between factor. :param detailed: Boolean Output additional details from Kruskal-Wallis H-test. :return: Pandas.DataFrame """ result_df = pg.kruskal( data=self.data, dv=value_col, between=group_col, detailed=detailed ) # Add significance asterisk pvalue_output = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_output return result_df
[docs] def get_cochran(self, value_col=None, group_col=None, subgroup_col=None): """ Calculate Cochran Q Test. This is used when the dependent variable, or value_col, is binary. For details between groups, posthoc test will be needed. :param value_col: String Name of column containing the dependent variable. :param group_col: String Name of column containing the within factor. :param subgroup_col: String Name of column containing the subject identifier. :return: Pandas.DataFrame """ if subgroup_col: result_df = pg.cochran(data=self.data, dv=value_col, within=subgroup_col, subject=group_col) else: result_df = pg.cochran(data=self.data, dv=value_col, within=group_col) # Add significance asterisk pvalue_output = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_output return result_df
[docs] def get_friedman(self, group_col=None, value_col=None, subgroup_col=None, method="chisq"): """ Calculate Friedman Test. Determines if distributions of two or more paired samples are equal. For details between groups, posthoc test (get_pairwise_tests(parametric=False)) will be needed. :param value_col: String Name of column containing the dependent variable :param group_col: String Name of column containing the between-subject factor. :param subgroup_col: String Name of column containing the subject/rater identifier :param method: String Statistical test to perform. Must be 'chisq' (chi-square test) or 'f' (F test). See Pingouin documentation for further details :return: Pandas.DataFrame """ # Raise error if subgroup_col not given if subgroup_col is None: raise ValueError("Friedman test must be in long format and requires a subgroup_col as subject") result_df = pg.friedman(data=self.data, dv=value_col, within=group_col, subject=subgroup_col, method=method) # Add significance asterisk pvalue_output = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_output return result_df
""" pairwise t-tests below """
[docs] def get_pairwise_tests(self, value_col=None, group_col=None, within_subject_col=None, subject_col=None, parametric=True, **kwargs): """ Posthoc test for parametric or nonparametric statistics. By default, the parametric parameter is set as True. :param value_col: String Name of column containing the dependent variable. :param group_col: String or list with 2 elements Name of column containing the between-subject factors. :param within_subject_col: String or list with 2 elements Name of column containing the within-subject identifier. :param subject_col: String Name of column containing the subject identifier. This is mandatory if subgroup_col is used. :param parametric: Boolean If True (default), use the parametric ttest() function. If False, use [pingouin.wilcoxon()](https://pingouin-stats.org/build/html/generated/pingouin.wilcoxon.html#pingouin.wilcoxon) or [pingouin.mwu()](https://pingouin-stats.org/build/html/generated/pingouin.mwu.html#pingouin.mwu) for paired or unpaired samples, respectively. :param kwargs: dict Additional keywords arguments that are passed to [pingouin.pairwise_tests()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_tests.html#pingouin.pairwise_tests). :return: pandas.DataFrame """ result_df = pg.pairwise_tests(data=self.data, dv=value_col, between=group_col, within=within_subject_col, subject=subject_col, parametric=parametric, **kwargs) # Add significance asterisk pvalue_output = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_output return result_df
[docs] def get_pairwise_rm(self, value_col=None, group_col=None, within_subject_col=None, subject_col=None, parametric=True, **kwargs, ): """ Posthoc test for repeated measures. :param value_col: String Name of column containing the dependent variable. :param group_col: String or list with 2 elements Name of column containing the between-subject factors. :param within_subject_col: String or list with 2 elements Name of column containing the within-subject identifier. :param subject_col: String Name of column containing the subject identifier. This is mandatory if subgroup_col is used. :param parametric: Boolean If True (default), use the parametric ttest() function. If False, use [pingouin.wilcoxon()](https://pingouin-stats.org/build/html/generated/pingouin.wilcoxon.html#pingouin.wilcoxon) or [pingouin.mwu()](https://pingouin-stats.org/build/html/generated/pingouin.mwu.html#pingouin.mwu) for paired or unpaired samples, respectively. :param kwargs: dict Additional keywords arguments that are passed to [pingouin.pairwise_tests()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_tests.html#pingouin.pairwise_tests). :return: pandas.DataFrame """ result_df = pg.pairwise_tests(data=self.data, dv=value_col, between=group_col, within=within_subject_col, subject=subject_col, parametric=parametric, **kwargs) # Add significance asterisk pvalue_output = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_output return result_df
[docs] def get_pairwise_mixed(self, value_col=None, group_col=None, within_subject_col=None, subject_col=None, parametric=True, **kwargs): """ Posthoc test for mixed ANOVA. :param value_col: String Name of column containing the dependent variable. :param group_col: String or list with 2 elements Name of column containing the between-subject factors. :param within_subject_col: String or list with 2 elements Name of column containing the within-subject identifier. :param subject_col: String Name of column containing the subject identifier. This is mandatory if subgroup_col is used. :param parametric: Boolean If True (default), use the parametric ttest() function. If False, use [pingouin.wilcoxon()](https://pingouin-stats.org/build/html/generated/pingouin.wilcoxon.html#pingouin.wilcoxon) or [pingouin.mwu()](https://pingouin-stats.org/build/html/generated/pingouin.mwu.html#pingouin.mwu) for paired or unpaired samples, respectively. :param kwargs: dict Additional keywords arguments that are passed to [pingouin.pairwise_tests()](https://pingouin-stats.org/build/html/generated/pingouin.pairwise_tests.html#pingouin.pairwise_tests). :return: pandas.DataFrame """ result_df = pg.pairwise_tests(data=self.data, dv=value_col, between=group_col, within=within_subject_col, subject=subject_col, parametric=parametric, **kwargs) # Add significance asterisk pvalue_output = [utils.star_value(value) for value in result_df.p_unc] result_df["significance"] = pvalue_output return result_df
""" Output P-Values as a matrix in Pandas DataFrame """
[docs] @staticmethod def get_p_matrix(data, test=None, group_col1=None, group_col2=None, order=None): """ Convert dataframe of statistic results into a matrix. Group columns must be indicated. Group 2 is optional and depends on test used (i.e. pairwise vs Mann-Whitney U). Final DataFrame output can be used with the Plots.p_matrix() function to generate a heatmap of p-values. :param data: pandas.DataFrame Input DataFrame. Must be of already computed test results. :param group_col1: String Name of column containing the group :param group_col2: String Name of column containing the second group. This variable is optional. :param test: String Name of the test used to calculate statistics. :param order: List or String == "alpha" Reorder the groups for the final table. If input is string "alpha", the order of the groups will be alphabetized. :return: """ matrix_df = utils.multi_group(data, group_col1, group_col2, test, order) return matrix_df
""" Function to detail significance column meaning """
[docs] @staticmethod def explain_significance(): """ Print out DataFrame containing explanations for star values. This is used for reference. See [GraphPad](https://www.graphpad.com/support/faq/what-is-the-meaning-of--or--or--in-reports-of-statistical-significance-from-prism-or-instat/) :return: pandas.DataFrame """ df = pd.DataFrame( { "pvalue": [ "p > 0.05", "p ≤ 0.05", " p ≤ 0.01", "p ≤ 0.001", "p ≤ 0.0001", ], "p_value": ["No Significance (n.s.)", "*", "**", "***", "****"], } ) return df
[docs] class Plots(Stats):
[docs] def __init__(self, data): super().__init__(data)
[docs] @staticmethod def list_test(): """ List all tests available for plotting :return: """ print( "List of tests available for plotting: 'tukey', 'gameshowell', 'pairwise-parametric', 'pairwise-rm', 'pairwise-mixed', 'pairwise-nonparametric', 'wilcoxon', 'mannu', 'kruskal'" "'kruskal'" )
[docs] def boxplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None, subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None, orient="v", loc="inside", whis=1.5, return_df=None, **kwargs): """ Draw a boxplot from the input DataFrame. :param test: String Name of test for calculations. Names must match the test names from the py50.Stats() :param group_col: String Name of column containing groups. This should be the between depending on the selected test. :param value_col: String Name of the column containing the values. This is the dependent variable. :param group_order: List. Place the groups in a specific order on the plot. :param subgroup_col: String Name of the column containing the subgroup for the group column. This is associated with the hue parameters in Seaborn. :param subject_col: String Name of the column containing the subject column. :param within_subject_col: String Name of the column containing the within subject column. :param pairs: List A list containing specific pairings for annotation on the plot. :param pvalue_label: List. A list containing specific pvalue labels. This order must match the length of pairs list. :param hide_ns: bool Automatically hide groups with no significance from plot. :param palette: String or List. Color palette used for the plot. Can be given as common color name or in hex code. :param orient: String Orientation of the plot. Only "v" and "h" are for vertical and horizontal, respectively, is supported :param loc: String Set location of annotations. Only "inside" or "outside" are supported. :param whis: Int Set length of whiskers on plot. :param return_df: Boolean Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs. :return: Fig """ # separate kwargs for sns and sns valid_sns = utils.get_kwargs(sns.boxplot) valid_annot = utils.get_kwargs(Annotator) sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns} annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot} # Perform Stat calculations and get pairs and pvalue for annotation pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col, within_subject_col, test, value_col) # Set kwargs dictionary for line annotations annotate_kwargs = {} if "line_offset_to_group" in kwargs and "line_offset" in kwargs: # Get kwargs from input line_offset_to_group = kwargs["line_offset_to_group"] line_offset = kwargs["line_offset"] # Add to dictionary annotate_kwargs["line_offset_to_group"] = line_offset_to_group annotate_kwargs["line_offset"] = line_offset # Set order for groups on plot if group_order: group_order = group_order # hide ns label if hide_ns: pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot) else: data_is_sig = True # set orientation for plot and Annotator orient = orient.lower() if orient == "v": x_input = group_col y_input = value_col elif orient == "h": x_input = value_col y_input = group_col else: raise ValueError("Orientation must be 'v' or 'h'!") # set optional subgroup_col if subgroup_col: subgroup_hue = subgroup_col else: subgroup_hue = group_col # plot ax = sns.boxplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue, whis=whis, **sns_kwargs) annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False, orient=orient, hue=subgroup_hue, **annot_kwargs) # Optional input to make custom labels if pvalue_label: pvalue_plot = pvalue_label # Location of annotations if loc not in ["inside", "outside"]: raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!") if loc == "inside": annotator.configure(loc=loc, test=None) else: annotator.configure(loc=loc, test=None) # Make sure the pairs and pvalue lists match if len(pairs) != len(pvalue_plot): raise Exception("pairs and pvalue_order length does not match!") # if no significance elif not data_is_sig: pass else: annotator.set_custom_annotations(pvalue_plot) annotator.annotate(**annotate_kwargs) # Adjust title and title titlesize from kwargs title = kwargs.pop("title", None) titlesize = kwargs.pop("titlesize", None) if title: plt.title(title) if title and titlesize: plt.title(title, fontsize=titlesize) plt.tight_layout() # Return DataFrame AND figure if return_df: return stat_df_result, annotator return annotator
[docs] def barplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None, subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None, orient="v", loc="inside", errorbar="sd", capsize=0.1, return_df=None, **kwargs): """ Draw a barplot from the input DataFrame. :param test: String Name of test for calculations. Names must match the test names from the py50.Stats() :param group_col: String Name of column containing groups. This should be the between depending on the selected test. :param value_col: String Name of the column containing the values. This is the dependent variable. :param group_order: List. Place the groups in a specific order on the plot. :param subgroup_col: String Name of the column containing the subgroup for the group column. This is associated with the hue parameters in Seaborn. :param subject_col: String Name of the column containing the subject column. :param within_subject_col: String Name of the column containing the within subject column. :param pairs: List A list containing specific pairings for annotation on the plot. :param pvalue_label: List. A list containing specific pvalue labels. This order must match the length of pairs list. :param hide_ns: bool Automatically hide groups with no significance from plot. :param palette: String or List. Color palette used for the plot. Can be given as common color name or in hex code. :param orient: String Orientation of the plot. Only "v" and "h" are for vertical and horizontal, respectively, is supported :param loc: String Set location of annotations. Only "inside" or "outside" are supported. :param errorbar: String Set confidence interval on plot. :param capsize: Int Set cap size on plot. :param return_df: Boolean Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs. :return: """ valid_sns = utils.get_kwargs(sns.barplot) valid_annot = utils.get_kwargs(Annotator) sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns} annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot} # Perform Stat calculations and get pairs and pvalue for annotation pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col, within_subject_col, test, value_col) # Set kwargs dictionary for line annotations annotate_kwargs = {} if "line_offset_to_group" in kwargs and "line_offset" in kwargs: # Get kwargs from input line_offset_to_group = kwargs["line_offset_to_group"] line_offset = kwargs["line_offset"] # Add to dictionary annotate_kwargs["line_offset_to_group"] = line_offset_to_group annotate_kwargs["line_offset"] = line_offset # Set order for groups on plot if group_order: group_order = group_order # hide ns label if hide_ns: pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot) else: data_is_sig = True # set orientation for plot and Annotator orient = orient.lower() if orient == "v": x_input = group_col y_input = value_col elif orient == "h": x_input = value_col y_input = group_col else: raise ValueError("Orientation must be 'v' or 'h'!") # set optional subgroup_col if subgroup_col: subgroup_hue = subgroup_col else: subgroup_hue = group_col # plot ax = sns.barplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue, errorbar=errorbar, capsize=capsize, **sns_kwargs) annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False, orient=orient, hue=subgroup_hue, **annot_kwargs) # optional input for custom annotations if pvalue_label: pvalue_plot = pvalue_label # Location of annotations if loc not in ["inside", "outside"]: raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!") if loc == "inside": annotator.configure(loc=loc, test=None) else: annotator.configure(loc=loc, test=None) # Make sure the pairs and pvalue lists match if len(pairs) != len(pvalue_plot): raise Exception("pairs and pvalue_order length does not match!") # if no significance elif not data_is_sig: pass else: annotator.set_custom_annotations(pvalue_plot) annotator.annotate(**annotate_kwargs) # Adjust title and title titlesize from kwargs title = kwargs.pop("title", None) titlesize = kwargs.pop("titlesize", None) if title: plt.title(title) if title and titlesize: plt.title(title, fontsize=titlesize) plt.tight_layout() # Return DataFrame AND figure if return_df: return stat_df_result, annotator return annotator
[docs] def violinplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None, subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None, orient="v", loc="inside", return_df=None, **kwargs): """ Draw a violinplot from the input DataFrame. :param test: String Name of test for calculations. Names must match the test names from the py50.Stats() :param group_col: String Name of column containing groups. This should be the between depending on the selected test. :param value_col: String Name of the column containing the values. This is the dependent variable. :param group_order: List. Place the groups in a specific order on the plot. :param subgroup_col: String Name of the column containing the subgroup for the group column. This is associated with the hue parameters in Seaborn. :param subject_col: String Name of the column containing the subject column. :param within_subject_col: String Name of the column containing the within subject column. :param pairs: List A list containing specific pairings for annotation on the plot. :param pvalue_label: List. A list containing specific pvalue labels. This order must match the length of pairs list. :param hide_ns: bool Automatically hide groups with no significance from plot. :param palette: String or List. Color palette used for the plot. Can be given as common color name or in hex code. :param orient: String Orientation of the plot. Only "v" and "h" are for vertical and horizontal, respectively, is supported :param loc: String Set location of annotations. Only "inside" or "outside" are supported. :param return_df: Boolean Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs. :return: """ # separate kwargs for sns and sns valid_sns = utils.get_kwargs(sns.violinplot) valid_annot = utils.get_kwargs(Annotator) sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns} annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot} # Perform Stat calculations and get pairs and pvalue for annotation pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col, within_subject_col, test, value_col) # Set kwargs dictionary for line annotations annotate_kwargs = {} if "line_offset_to_group" in kwargs and "line_offset" in kwargs: # Get kwargs from input line_offset_to_group = kwargs["line_offset_to_group"] line_offset = kwargs["line_offset"] # Add to dictionary annotate_kwargs["line_offset_to_group"] = line_offset_to_group annotate_kwargs["line_offset"] = line_offset # Set order for groups on plot if group_order: group_order = group_order # hide ns label if hide_ns: pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot) else: data_is_sig = True # set orientation for plot and Annotator orient = orient.lower() if orient == "v": x_input = group_col y_input = value_col elif orient == "h": x_input = value_col y_input = group_col else: raise ValueError("Orientation must be 'v' or 'h'!") # set optional subgroup_col if subgroup_col: subgroup_hue = subgroup_col else: subgroup_hue = group_col # plot ax = sns.violinplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue, **sns_kwargs) annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False, orient=orient, hue=subgroup_hue, **annot_kwargs) # optional input for custom annotations if pvalue_label: pvalue_plot = pvalue_label # Location of annotations if loc not in ["inside", "outside"]: raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!") if loc == "inside": annotator.configure(loc=loc, test=None) else: annotator.configure(loc=loc, test=None) # Make sure the pairs and pvalue lists match if len(pairs) != len(pvalue_plot): raise Exception("pairs and pvalue_order length does not match!") # if no significance elif not data_is_sig: pass else: annotator.set_custom_annotations(pvalue_plot) annotator.annotate(**annotate_kwargs) # Adjust title and title titlesize from kwargs title = kwargs.pop("title", None) titlesize = kwargs.pop("titlesize", None) if title: plt.title(title) if title and titlesize: plt.title(title, fontsize=titlesize) plt.tight_layout() # Return DataFrame AND figure if return_df: return stat_df_result, annotator else: return annotator
[docs] def swarmplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None, subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None, orient="v", loc="inside", return_df=None, **kwargs): """ Draw a swarm plot from the input DataFrame. :param test: String Name of test for calculations. Names must match the test names from the py50.Stats() :param group_col: String Name of column containing groups. This should be the between depending on the selected test. :param value_col: String Name of the column containing the values. This is the dependent variable. :param group_order: List. Place the groups in a specific order on the plot. :param subgroup_col: String Name of the column containing the subgroup for the group column. This is associated with the hue parameters in Seaborn. :param subject_col: String Name of the column containing the subject column. :param within_subject_col: String Name of the column containing the within subject column. :param pairs: List A list containing specific pairings for annotation on the plot. :param pvalue_label: List. A list containing specific pvalue labels. This order must match the length of pairs list. :param hide_ns: bool Automatically hide groups with no significance from plot. :param palette: String or List. Color palette used for the plot. Can be given as common color name or in hex code. :param orient: String Orientation of the plot. Only "v" and "h" are for vertical and horizontal, respectively, is supported :param loc: String Set location of annotations. Only "inside" or "outside" are supported. :param return_df: Boolean Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs. :return: """ # remove palette/hue warning warnings.filterwarnings("ignore", category=FutureWarning) # separate kwargs for sns and sns valid_sns = utils.get_kwargs(sns.swarmplot) valid_annot = utils.get_kwargs(Annotator) sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns} annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot} # Perform Stat calculations and get pairs and pvalue for annotation pairs, pvalue_plot, stat_df = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col, within_subject_col, test, value_col) # Set kwargs dictionary for line annotations annotate_kwargs = {} if "line_offset_to_group" in kwargs and "line_offset" in kwargs: # Get kwargs from input line_offset_to_group = kwargs["line_offset_to_group"] line_offset = kwargs["line_offset"] # Add to dictionary annotate_kwargs["line_offset_to_group"] = line_offset_to_group annotate_kwargs["line_offset"] = line_offset # Set order for groups on plot if group_order: group_order = group_order # hide ns label if hide_ns: pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot) else: data_is_sig = True # set orientation for plot and Annotator orient = orient.lower() if orient == "v": x_input = group_col y_input = value_col elif orient == "h": x_input = value_col y_input = group_col else: raise ValueError("Orientation must be 'v' or 'h'!") # set optional subgroup_col if subgroup_col: subgroup_hue = subgroup_col else: subgroup_hue = group_col # plot ax = sns.swarmplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue, **sns_kwargs) annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False, orient=orient, hue=subgroup_hue, **annot_kwargs, ) # optional input for custom annotations if pvalue_label: pvalue_plot = pvalue_label # # For debugging pairs and pvalue list orders # print(pairs) # print(pvalue) # Location of annotations if loc not in ["inside", "outside"]: raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!") if loc == "inside": annotator.configure(loc=loc, test=None) else: annotator.configure(loc=loc, test=None) # Make sure the pairs and pvalue lists match if len(pairs) != len(pvalue_plot): raise Exception("pairs and pvalue_order length does not match!") # if no significance elif not data_is_sig: pass else: annotator.set_custom_annotations(pvalue_plot) annotator.annotate(**annotate_kwargs) # Adjust title and title titlesize from kwargs title = kwargs.pop("title", None) titlesize = kwargs.pop("titlesize", None) if title: plt.title(title) if title and titlesize: plt.title(title, fontsize=titlesize) plt.tight_layout() # Return DataFrame AND figure if return_df: return stat_df, annotator return annotator
[docs] def stripplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None, subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None, orient="v", loc="inside", return_df=None, **kwargs): """ Draw a stripplot from the input DataFrame. :param test: String Name of test for calculations. Names must match the test names from the py50.Stats() :param group_col: String Name of column containing groups. This should be the between depending on the selected test. :param value_col: String Name of the column containing the values. This is the dependent variable. :param group_order: List. Place the groups in a specific order on the plot. :param subgroup_col: String Name of the column containing the subgroup for the group column. This is associated with the hue parameters in Seaborn. :param subject_col: String Name of the column containing the subject column. :param within_subject_col: String Name of the column containing the within subject column. :param pairs: List A list containing specific pairings for annotation on the plot. :param pvalue_label: List. A list containing specific pvalue labels. This order must match the length of pairs list. :param hide_ns: bool Automatically hide groups with no significance from plot. :param palette: String or List. Color palette used for the plot. Can be given as common color name or in hex code. :param orient: String Orientation of the plot. Only "v" and "h" are for vertical and horizontal, respectively, is supported :param loc: String Set location of annotations. Only "inside" or "outside" are supported. :param return_df: Boolean Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs. :return: """ # remove palette/hue warning warnings.filterwarnings("ignore", category=FutureWarning) # separate kwargs for sns and sns valid_sns = utils.get_kwargs(sns.stripplot) valid_annot = utils.get_kwargs(Annotator) sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns} annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot} # Perform Stat calculations and get pairs and pvalue for annotation pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col, within_subject_col, test, value_col) # Set kwargs dictionary for line annotations annotate_kwargs = {} if "line_offset_to_group" in kwargs and "line_offset" in kwargs: # Get kwargs from input line_offset_to_group = kwargs["line_offset_to_group"] line_offset = kwargs["line_offset"] # Add to dictionary annotate_kwargs["line_offset_to_group"] = line_offset_to_group annotate_kwargs["line_offset"] = line_offset # Set order for groups on plot if group_order: group_order = group_order # hide ns label if hide_ns: pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot) else: data_is_sig = True # set orientation for plot and Annotator orient = orient.lower() if orient == "v": x_input = group_col y_input = value_col elif orient == "h": x_input = value_col y_input = group_col else: raise ValueError("Orientation must be 'v' or 'h'!") # set optional subgroup_col if subgroup_col: subgroup_hue = subgroup_col else: subgroup_hue = group_col # plot ax = sns.stripplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue, **sns_kwargs) annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False, orient=orient, hue=subgroup_hue, **annot_kwargs) # optional input for custom annotations if pvalue_label: pvalue_plot = pvalue_label # # For debugging pairs and pvalue list orders # print(pairs) # print(pvalue) # Location of annotations if loc not in ["inside", "outside"]: raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!") if loc == "inside": annotator.configure(loc=loc, test=None) else: annotator.configure(loc=loc, test=None) # Make sure the pairs and pvalue lists match if len(pairs) != len(pvalue_plot): raise Exception("pairs and pvalue_order length does not match!") # if no significance elif not data_is_sig: pass else: annotator.set_custom_annotations(pvalue_plot) annotator.annotate(**annotate_kwargs) # Adjust title and title titlesize from kwargs title = kwargs.pop("title", None) titlesize = kwargs.pop("titlesize", None) if title: plt.title(title) if title and titlesize: plt.title(title, fontsize=titlesize) plt.tight_layout() # Return DataFrame AND figure if return_df: return stat_df_result, annotator return annotator
[docs] def boxenplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None, subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, hide_ns=False, palette=None, orient="v", loc="inside", return_df=None, **kwargs): """ Draw a boxenplot from the input DataFrame. :param test: String Name of test for calculations. Names must match the test names from the py50.Stats() :param group_col: String Name of column containing groups. This should be the between depending on the selected test. :param value_col: String Name of the column containing the values. This is the dependent variable. :param group_order: List. Place the groups in a specific order on the plot. :param subgroup_col: String Name of the column containing the subgroup for the group column. This is associated with the hue parameters in Seaborn. :param subject_col: String Name of the column containing the subject column. :param within_subject_col: String Name of the column containing the within subject column. :param pairs: List A list containing specific pairings for annotation on the plot. :param pvalue_label: List. A list containing specific pvalue labels. This order must match the length of pairs list. :param hide_ns: bool Automatically hide groups with no significance from plot. :param palette: String or List. Color palette used for the plot. Can be given as common color name or in hex code. :param orient: String Orientation of the plot. Only "v" and "h" are for vertical and horizontal, respectively, is supported :param loc: String Set location of annotations. Only "inside" or "outside" are supported. :param return_df: Boolean Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs. :return: """ # separate kwargs for sns and sns valid_sns = utils.get_kwargs(sns.boxenplot) valid_annot = utils.get_kwargs(Annotator) sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns} annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot} # Perform Stat calculations and get pairs and pvalue for annotation pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col, within_subject_col, test, value_col) # Set kwargs dictionary for line annotations annotate_kwargs = {} if "line_offset_to_group" in kwargs and "line_offset" in kwargs: # Get kwargs from input line_offset_to_group = kwargs["line_offset_to_group"] line_offset = kwargs["line_offset"] # Add to dictionary annotate_kwargs["line_offset_to_group"] = line_offset_to_group annotate_kwargs["line_offset"] = line_offset # Set order for groups on plot if group_order: group_order = group_order # hide ns label if hide_ns: pairs, pvalue_plot, data_is_sig = _option_to_hide_ns(hide_ns, pairs, pvalue_plot) else: data_is_sig = True # set orientation for plot and Annotator orient = orient.lower() if orient == "v": x_input = group_col y_input = value_col elif orient == "h": x_input = value_col y_input = group_col else: raise ValueError("Orientation must be 'v' or 'h'!") # set optional subgroup_col if subgroup_col: subgroup_hue = subgroup_col else: subgroup_hue = group_col # plot ax = sns.boxenplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue, **sns_kwargs) annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False, orient=orient, hue=subgroup_hue, **annot_kwargs) # optional input for custom annotations if pvalue_label: pvalue_plot = pvalue_label # # For debugging pairs and pvalue list orders # print(pairs) # print(pvalue) # Location of annotations if loc not in ["inside", "outside"]: raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!") if loc == "inside": annotator.configure(loc=loc, test=None) else: annotator.configure(loc=loc, test=None) # Make sure the pairs and pvalue lists match if len(pairs) != len(pvalue_plot): raise Exception("pairs and pvalue_order length does not match!") # if no significance elif not data_is_sig: pass else: annotator.set_custom_annotations(pvalue_plot) annotator.annotate(**annotate_kwargs) # Adjust title and title titlesize from kwargs title = kwargs.pop("title", None) titlesize = kwargs.pop("titlesize", None) if title: plt.title(title) if title and titlesize: plt.title(title, fontsize=titlesize) plt.tight_layout() # Return DataFrame AND figure if return_df: return stat_df_result, annotator return annotator
[docs] def ci_plot(self, data: Optional = None, value_col: str = None, group_col: str = None, alpha: float = 0.05, title: str = "Tukey HSD Confidence Intervals", xlabel: str = None, ylabel: str = None, linewidth: float = 1.5, figsize: tuple = (8, 6), return_stats: bool = False): """ Generate a confidence interval plot. The plot utilizes the Tukey Honest Significant Difference (HSD) test and is a wrapper for statsmodels (https://www.statsmodels.org/dev/index.html). ANOVA will also be calculated and its p-value will be plotted alongside the title. :param data: Optional Input dataset. :param value_col: str Name of the column containing the dependent variable. :param group_col: str Name of the column containing the groups. :param alpha: float The significance level for the test. :param title: str Set the title for the figure. Defaults to "Tukey HSD Confidence Intervals". :param xlabel: str Set the label for the x-axis. If None is given, defaults to the value_col input. :param ylabel: str Set the label for the y-axis. If None is given, defaults to the group_col input. :param linewidth: float Set the width of the lines. :param figsize: tuple Set the figure size. Defaults to (8,6). :param return_stats: bool Whether to return the Tukey HSD test. :return: """ if data is None: data = self.data # calculate tukey using statsmodels tukey = pairwise_tukeyhsd(endog=data[value_col], groups=data[group_col], alpha=alpha) # calculate anova stat = Stats(data=self.data) anova_table = stat.get_anova(value_col=value_col, group_col=group_col) anova = anova_table.p_unc.iloc[0] # identify highest mean group_means = data.groupby(group_col)[value_col].mean() best_group = group_means.idxmax() # set labels if xlabel is None: xlabel = value_col if ylabel is None: ylabel = group_col # optional return of calculated stats if return_stats: print(tukey.summary()) # plot CI fig, ax = plt.subplots(figsize=figsize) tukey.plot_simultaneous(comparison_name=best_group, ax=ax) ax = fig.axes[0] # change line thickness for collection in ax.collections: collection.set_linewidth(linewidth) # dynamically set center dot size for line in ax.lines: if line.get_linestyle() != '--': # avoid dash vertical original_marker_size = 8 marker_size = original_marker_size * (linewidth / 2) if marker_size < original_marker_size: marker_size = original_marker_size line.set_markersize(marker_size) plt.title(f"{title} | ANOVA p={anova:.3f}", fontsize=18) plt.xlabel(xlabel, fontsize=12, labelpad=10) plt.ylabel(ylabel, fontsize=12, labelpad=10) plt.tight_layout()
# todo add support for lineplot def _lineplot(self, test=None, group_col=None, value_col=None, group_order=None, subgroup_col=None, subject_col=None, within_subject_col=None, pairs=None, pvalue_label=None, palette=None, orient="v", loc="inside", ci="sd", capsize=0.1, return_df=None, **kwargs): """ Draw a lineplot from the input DataFrame. :param test: String Name of test for calculations. Names must match the test names from the py50.Stats() :param group_col: String Name of column containing groups. This should be the between depending on the selected test. :param value_col: String Name of the column containing the values. This is the dependent variable. :param group_order: List. Place the groups in a specific order on the plot. :param subgroup_col: String Name of the column containing the subgroup for the group column. This is associated with the hue parameters in Seaborn. :param subject_col: String Name of the column containing the subject column. :param within_subject_col: String Name of the column containing the within subject column. :param pairs: List A list containing specific pairings for annotation on the plot. :param pvalue_label: List. A list containing specific pvalue labels. This order must match the length of pairs list. :param palette: String or List. Color palette used for the plot. Can be given as common color name or in hex code. :param orient: String Orientation of the plot. Only "v" and "h" are for vertical and horizontal, respectively, is supported :param loc: String Set location of annotations. Only "inside" or "outside" are supported. :param ci: String Set confidence interval on plot. :param capsize: Int Set cap size on plot. :param return_df: Boolean Returns a DataFrame of calculated results. If pairs used, only return rows with annotated pairs. :return: """ # separate kwargs for sns and sns valid_sns = utils.get_kwargs(sns.lineplot) valid_annot = utils.get_kwargs(Annotator) sns_kwargs = {key: value for key, value in kwargs.items() if key in valid_sns} annot_kwargs = {key: value for key, value in kwargs.items() if key in valid_annot} # Perform Stat calculations and get pairs and pvalue for annotation pairs, pvalue_plot, stat_df_result = Plots._get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col, within_subject_col, test, value_col) # Set kwargs dictionary for line annotations annotate_kwargs = {} if "line_offset_to_group" in kwargs and "line_offset" in kwargs: # Get kwargs from input line_offset_to_group = kwargs["line_offset_to_group"] line_offset = kwargs["line_offset"] # Add to dictionary annotate_kwargs["line_offset_to_group"] = line_offset_to_group annotate_kwargs["line_offset"] = line_offset # Set order for groups on plot if group_order: group_order = group_order # set orientation for plot and Annotator orient = orient.lower() if orient == "v": x_input = group_col y_input = value_col elif orient == "h": x_input = value_col y_input = group_col else: raise ValueError("Orientation must be 'v' or 'h'!") # set optional subgroup_col if subgroup_col: subgroup_hue = subgroup_col else: subgroup_hue = group_col # plot # ci and capsize for errorbar ax = sns.lineplot(data=self.data, x=x_input, y=y_input, order=group_order, palette=palette, hue=subgroup_hue, ci=ci, capsize=capsize, **sns_kwargs) annotator = Annotator(ax, pairs=pairs, data=self.data, x=x_input, y=y_input, order=group_order, verbose=False, orient=orient, hue=subgroup_hue, **annot_kwargs) # optional input for custom annotations if pvalue_label: pvalue_plot = pvalue_label # Location of annotations if loc not in ["inside", "outside"]: raise ValueError("Invalid loc! Only 'inside' or 'outside' are accepted!") if loc == "inside": annotator.configure(loc=loc, test=None) else: annotator.configure(loc=loc, test=None) # Make sure the pairs and pvalue lists match if len(pairs) != len(pvalue_plot): raise Exception("pairs and pvalue_order length does not match!") else: annotator.set_custom_annotations(pvalue_plot) annotator.annotate(**annotate_kwargs) # Adjust title and title titlesize from kwargs if "title" in kwargs: plt.title(kwargs["title"]) if "title" and "titlesize" in kwargs: plt.title(kwargs["title"], titlesize=kwargs["titlesize"]) plt.tight_layout() # Return DataFrame AND figure if return_df: return stat_df_result, annotator return annotator
[docs] def p_matrix(self, data=None, cmap=None, title=None, titlesize=14, linewidths=0.01, linecolor="gray", **kwargs): """ Wrapper function for scikit_posthoc heatmap. :param data: Pandas.Dataframe Input table must be a matrix calculated using the stats.get_p_matrix(). Optional. :param cmap: List A list of colors. Can be color names or hex codes. :param title: String Input title for figure. :param title_titlesize: Int Set size of figure legend. :param linewidths: Int Set line width of figure. :param linecolor: String Set line color. Can be color name or hex code. :param kwargs: Optional Keyword arguemnts associated with [scikit-posthocs](https://scikit-posthocs.readthedocs.io/en/latest/) :return: Pyplot figure """ if data is None: data = self.data if title: plt.title(title, fontsize=titlesize) if cmap is None: # cmap list for 1, NS, 0.001, 0.01, 0.05 cmap = ["1", "#fbd7d4", "#005a32", "#238b45", "#a1d99b"] fig = sp.sign_plot(data, cmap=cmap, linewidths=linewidths, linecolor=linecolor, **kwargs) else: fig = sp.sign_plot(data, cmap=cmap, linewidths=linewidths, linecolor=linecolor, **kwargs) # Display plot return fig
""" Functions to plot data distribution """
[docs] def distribution(self, val_col=None, type="histplot", **kwargs): """ :param self: Pandas.Dataframe Input data. :param val_col: String The name of the column containing the dependent variable. :param type: String The type of figure drawn. For distribution, only "histplot" or "qqplot" supported :param kwargs: Optional keyword arguments for seaborn or pg.qqplot. :return: figure """ # Incorporate params from sns.histplot and pg.qq valid_hist = utils.get_kwargs(sns.histplot) valid_qq = utils.get_kwargs(pg.qqplot) hist_kwargs = {key: value for key, value in kwargs.items() if key in valid_hist} qq_kwargs = {key: value for key, value in kwargs.items() if key in valid_qq} if type == "histplot": fig = sns.histplot(data=self.data, x=val_col, **hist_kwargs) elif type == "qqplot": fig = pg.qqplot(self.data[val_col], dist="norm", **qq_kwargs) else: raise ValueError("For test parameter, only 'histplot' or 'qqplot' available") return fig
def _get_test(self, group_col, kwargs, pairs, subgroup_col, subject_col, within_subject_col, test, value_col): """ Function to obtain the pvalues and pairs for annotating the plot. :param self: self.df :param group_col: group_col :param kwargs: kwargs :param pairs: pairs :param subgroup_col: subgroup_col :param test: test :param value_col: value_col :return: """ global stat_df, pvalue # Check input test and run calculation if test == "tukey": # Get kwargs for pingouin valid_pg = utils.get_kwargs(pg.pairwise_tukey) pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg} stat_df = Stats(self.data).get_tukey(value_col=value_col, group_col=group_col, **pg_kwargs) """Get pvalue and pairs from table""" # result_df has removed rows with n.s. This is only needed if plot has specific pairs input stat_df = _get_pair_subgroup(stat_df, hue=pairs) pvalue = [utils.star_value(value) for value in stat_df.p_tukey.tolist()] pairs = [(a, b) for a, b in zip(stat_df["A"], stat_df["B"])] elif test == "gameshowell": # Get kwargs for pingouin valid_pg = utils.get_kwargs(pg.pairwise_gameshowell) pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg} stat_df = Stats(self.data).get_gameshowell(value_col=value_col, group_col=group_col, **pg_kwargs) """Get pvalue and pairs from table""" # result_df has removed rows with n.s. This is only needed if plot has specific pairs input stat_df = _get_pair_subgroup(stat_df, hue=pairs) pvalue = [utils.star_value(value) for value in stat_df.pval.tolist()] pairs = [(a, b) for a, b in zip(stat_df["A"], stat_df["B"])] elif test == "pairwise-rm": # Get kwargs for pingouin valid_pg = utils.get_kwargs(pg.pairwise_tests) pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg} # print(pg_kwargs) stat_df = Stats(self.data).get_pairwise_rm(value_col=value_col, group_col=group_col, within_subject_col=within_subject_col, subject_col=subject_col, parametric=True, **pg_kwargs) """Get pvalue and pairs from table""" # result_df has removed rows with n.s. This is only needed if plot has specific pairs input stat_df = _get_pair_subgroup(stat_df, hue=pairs) pvalue = [utils.star_value(value) for value in stat_df.p_unc.tolist()] pairs = [(a, b) for a, b in zip(stat_df["A"], stat_df["B"])] elif test == "pairwise-mixed": # Get kwargs for pingouin valid_pg = utils.get_kwargs(pg.pairwise_tests) pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg} stat_df = Stats(self.data).get_pairwise_mixed(value_col=value_col, group_col=group_col, within_subject_col=within_subject_col, subgroup_col=subject_col, parametric=True, **pg_kwargs) """Get pvalue and pairs from table""" # result_df has removed rows with n.s. This is only needed if plot has specific pairs input stat_df = _get_pair_subgroup(stat_df, hue=pairs) pvalue = [utils.star_value(value) for value in stat_df.p_unc.tolist()] pairs = [(a, b) for a, b in zip(stat_df["A"], stat_df["B"])] elif test == "pairwise-nonparametric": # Get kwargs for pingouin valid_pg = utils.get_kwargs(pg.pairwise_tests) pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg} stat_df = Stats(self.data).get_pairwise_tests(value_col=value_col, group_col=group_col, within_subject_col=within_subject_col, subject_col=subject_col, parametric=False, **pg_kwargs) """Get pvalue and pairs from table""" # result_df has removed rows with n.s. This is only needed if plot has specific pairs input stat_df = _get_pair_subgroup(stat_df, hue=pairs) pvalue = [utils.star_value(value) for value in stat_df.p_unc.tolist()] pairs = [(a, b) for a, b in zip(stat_df["A"], stat_df["B"])] elif test == "pairwise-parametric": # Get kwargs for pingouin valid_pg = utils.get_kwargs(pg.pairwise_tests) pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg} stat_df = Stats(self.data).get_pairwise_tests(value_col=value_col, group_col=group_col, within_subject_col=within_subject_col, subject_col=subgroup_col, parametric=True, **pg_kwargs) """Get pvalue and pairs from table""" # result_df has removed rows with n.s. This is only needed if plot has specific pairs input stat_df = _get_pair_subgroup(stat_df, hue=pairs) pvalue = [utils.star_value(value) for value in stat_df.p_unc.tolist()] pairs = [(a, b) for a, b in zip(stat_df["A"], stat_df["B"])] elif test == "wilcoxon": # Get kwargs for pingouin valid_pg = utils.get_kwargs(pg.wilcoxon) pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg} stat_df = Stats(self.data).get_wilcoxon(value_col=value_col, group_col=group_col, subgroup_col=subgroup_col, **pg_kwargs) """Get pvalue and pairs from table""" # result_df has removed rows with n.s. This is only needed if plot has specific pairs input stat_df = _get_pair_subgroup(stat_df, hue=pairs) pvalue = [utils.star_value(value) for value in stat_df["p-val"].tolist()] pairs = [(a, b) for a, b in zip(stat_df["A"], stat_df["B"])] elif test == "mannu": # Get kwargs for pingouin valid_pg = utils.get_kwargs(pg.mwu) pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg} stat_df = Stats(self.data).get_mannu(value_col=value_col, group_col=group_col, subgroup_col=subgroup_col, alternative="two-sided", **pg_kwargs) """Get pvalue and pairs from table""" # result_df has removed rows with n.s. This is only needed if plot has specific pairs input stat_df = _get_pair_subgroup(stat_df, hue=pairs) pvalue = [utils.star_value(value) for value in stat_df["p-val"].tolist()] pairs = [(a, b) for a, b in zip(stat_df["A"], stat_df["B"])] elif test == "kruskal": # Get kwargs for pingouin valid_pg = utils.get_kwargs(pg.kruskal) pg_kwargs = {key: value for key, value in kwargs.items() if key in valid_pg} stat_df = Stats(self.data).get_kruskal(value_col=value_col, group_col=group_col, **pg_kwargs) """Get pvalue and pairs from table""" # result_df has removed rows with n.s. This is only needed if plot has specific pairs input stat_df = _get_pair_subgroup(stat_df, hue=pairs) pvalue = [utils.star_value(value) for value in stat_df.p_unc.tolist()] pairs = [(a, b) for a, b in zip(stat_df["A"], stat_df["B"])] else: print(f"Plotting not supported for {test}!") return pairs, pvalue, stat_df
def _get_pair_subgroup(df, hue=None): """Generate pairs by group_col and hue. Hue will designate which input rows to keep for plotting.""" if hue is None: hue = _get_pairs(df) # Convert filter_values to a set of tuples. Both directions are generated for checking df pairs. forward_set = {tuple(x) for x in hue} reverse_set = {(y, x) for (x, y) in forward_set} # Combine columns A and B into a single column of tuples df["AB"] = list(zip(df["A"], df["B"])) # Filtering DataFrame based on filter values filtered_df = (df[df["AB"].isin(forward_set) | df["AB"].isin(reverse_set)].copy().reset_index(drop=True)) # Make pairs between groups and subgroups by df filtered_df = _sort_df(filtered_df, hue) # Drop the combined column AB if not needed in the final output filtered_df.drop("AB", axis=1, inplace=True) return filtered_df def _get_pairs(df): # Support function to make pairs form dataframe into a list of tuples pairs = [(a, b) for a, b in zip(df["A"], df["B"])] return pairs # Custom sorting function def _pair_sort(list_order, row): # Support function to make pairs between groups and subgroups by df try: # Check both possible orders of the tuple index = list_order.index((row["A"], row["B"])) except ValueError: try: index = list_order.index((row["B"], row["A"])) except ValueError: # If the row tuple is not found in the desired_order list, assign a high index index = len(list_order) return index # Sort the DataFrame based on the custom sorting function def _sort_df(df, list_order): # Support function to make pairs between groups and subgroups by df sorted_indices = df.apply(lambda row: _pair_sort(list_order, row), axis=1) return df.iloc[sorted_indices.argsort()] # support function for cases where hide_ns is true, and nothing in plot is significant def _option_to_hide_ns(hide_ns: bool, pairs: Union[list[tuple[Any, Any]], Any], pvalue_plot: list[str]): # If set to True, only show plots with significance if hide_ns: # Filter n.s. from pvalue and pairs hidden_sigfig_data = [(item1, item2) for item1, item2 in zip(pvalue_plot, pairs) if item1 != "n.s."] if hidden_sigfig_data: # Unzip the filtered data into pvalue and pairs variables pvalue_plot, pairs = zip(*hidden_sigfig_data) data_is_sig = True else: data_is_sig = False warnings.warn("No Significant Values found after filtering. Plot drawn without annotations.") else: data_is_sig = True # to track sigfig for annotations return pairs, pvalue_plot, data_is_sig if __name__ == "__main__": import doctest doctest.testmod()