Source code for moonstone.plot.counts

import logging
import numpy as np
import pandas as pd
import plotly.graph_objects as go
from typing import Optional, List, Tuple, Union

from scipy.cluster import hierarchy
from scipy.spatial import distance

from moonstone.plot.graphs.bargraph import BarGraph, MatrixBarGraph
from moonstone.plot.graphs.box import GroupBoxGraph
from moonstone.plot.graphs.violin import GroupViolinGraph
from moonstone.utils.dict_operations import merge_dict
from moonstone.utils.plot import (
    add_x_to_plotting_options,
    add_default_titles_to_plotting_options,
    add_groups_annotations,
)
from moonstone.utils.pandas.series import SeriesBinning

logger = logging.getLogger(__name__)

# What might people want to visualize?


class PlotCountsStats:
    """
    Several plots available to visualize simple count data.
    """

    def __init__(self, dataframe: pd.DataFrame, items_name: str = "items"):
        """
        :param dataframe: count data; the mean is later computed row-wise (``axis=1``)
        :param items_name: name of the items, used in the default title of the plot
        """
        self.df = dataframe
        self.items_name = items_name

    def _default_mean_distribution_options(self) -> dict:
        """
        Build the plotting options used by plot_mean_distribution when the caller gives none.
        """
        return {
            "layout": {
                "title_text": "Distribution of %s mean" % self.items_name,
                "title_x": 0.5,
                "yaxis_type": "log",
            },
            # Same x-axis title as the default applied when the caller gives its own options.
            "xaxes": {"title_text": "mean of the number of reads", "tickangle": -60},
            "yaxes": {"title_text": "number of samples"},
        }

    def plot_mean_distribution(
        self,
        plotting_options: Optional[dict] = None,
        show: Optional[bool] = True,
        output_file: Optional[str] = False,
    ):
        """
        method to visualize the mean distribution of the number of reads by items

        :param show: set to False if you don't want to show the plot
        :param output_file: name of the output file
        :param plotting_options: options of plotting that will override the default setup \n
                                 [!] Make sure the value given to an argument is of the right type \n
                                 options allowed : 'log': `bool` ; 'colorbar': `[str, List[str]]` ;
                                 'tickangle': `[int, float]`
        """
        if plotting_options is None:
            plotting_options = self._default_mean_distribution_options()
        else:
            # Options given by the caller: only the missing entries are completed with defaults.
            plotting_options = add_default_titles_to_plotting_options(
                plotting_options,
                "Distribution of %s mean" % self.items_name,
                "mean of the number of reads",
                "number of samples",
            )
            plotting_options = add_x_to_plotting_options(plotting_options, "layout", "yaxis_type", "log")
            plotting_options = add_x_to_plotting_options(plotting_options, "xaxes", "tickangle", -60)

        # Mean of each row, then binned before being drawn as a bar graph.
        mean_series = self.df.mean(axis=1)
        binned_mean = SeriesBinning(mean_series).binned_data

        bar_fig = BarGraph(binned_mean, plotting_options, show=show, output_file=output_file)
        bar_fig.plot_one_graph(plotting_options, show=show, output_file=output_file)
class PlotTaxonomyCounts:
    """
    Plots available for taxonomy counts (multiindexed dataframe).

    Rows are taxa and columns are samples: prevalence and means are computed along ``axis=1``
    and relative abundances are computed per column.
    """

    def __init__(self, taxonomy_dataframe: pd.DataFrame):
        """
        Args:
            taxonomy_dataframe: counts dataframe; a Series is accepted and converted to a
              single-column dataframe.
        """
        if isinstance(taxonomy_dataframe, pd.Series):
            self.df = pd.DataFrame(taxonomy_dataframe)
        else:
            self.df = taxonomy_dataframe

    def compute_prevalence_series(self) -> pd.Series:
        """
        Compute, for each row, the percentage of columns in which the value is different from 0.
        """
        return (self.df != 0).sum(axis=1) / self.df.shape[1] * 100

    @property
    def prevalence_series(self):
        """
        Prevalence series, computed on first access and then cached in self._prevalence_series.
        """
        if getattr(self, "_prevalence_series", None) is None:
            self._prevalence_series = self.compute_prevalence_series()
        return self._prevalence_series

    def compute_relative_abundance_dataframe(self) -> pd.DataFrame:
        """
        Express every value as a percentage of the sum of its column.
        """
        return self.df * 100 / self.df.sum()

    @property
    def relative_abundance_dataframe(self):
        """
        Relative abundance dataframe, computed on first access and then cached in
        self._relative_abundance_dataframe.
        """
        if getattr(self, "_relative_abundance_dataframe", None) is None:
            self._relative_abundance_dataframe = self.compute_relative_abundance_dataframe()
        return self._relative_abundance_dataframe

    def _valid_mode_param(self, mode: str) -> str:
        """
        Normalize the mode given by the user to one of "boxplot", "violin" or "bargraph".

        Only the beginning of the string is checked ("box", "violin", "bar"). Anything else is
        reported with a warning and replaced by the default, "bargraph".
        """
        if mode.startswith("box"):
            return "boxplot"
        if mode.startswith("violin"):
            return "violin"
        if mode.startswith("bar"):
            return "bargraph"
        logger.warning("mode='%s' not valid, set to default (bargraph).", mode)
        return "bargraph"

    def _add_mean_info_to_index(self, data_df: Union[pd.DataFrame, pd.Series], mean_counts_ser_taxa: pd.Series):
        """
        add the mean of the taxa among samples at the end of its name in the index.

        The index of data_df is replaced in place and data_df is returned.

        Args:
            data_df: data whose index contains the taxa names.
            mean_counts_ser_taxa: mean counts of the taxa; must contain every label of data_df.index.
        """
        mean_suffixes = mean_counts_ser_taxa.loc[data_df.index].apply(" (mean={:,.2f})".format)
        data_df.index = data_df.index + mean_suffixes.astype("str")
        return data_df

    def _italicize_taxa_name(self, text: str) -> str:
        """
        put <i> and </i> around the taxa name, so that it will be shown in italic in the graph.
        It leaves the mean info or the higher classification between parenthesis, unitalicized.
        Underscores of the taxa name are replaced by spaces.

        Args:
            text: string that contains taxa name to italicize
        """
        # Everything from the first " (" onwards is kept as is, outside of the italic tags.
        taxa_name, separator, remainder = text.partition(" (")
        return "<i>" + taxa_name.replace("_", " ") + "</i>" + separator + remainder

    def _generate_list_species_to_plot(
        self,
        determining_ser_taxa: pd.Series,
        other_variable_ser_taxa: pd.Series,
        taxa_number: int = 20,
        determining_threshold: Optional[float] = None,
        higher_classification: bool = True,
        threshold_on_other_variable: Optional[float] = None,
        ascending: bool = False,
    ) -> pd.Index:
        """
        generate the list of species to plot, the most abundant/prevalent

        Args:
            determining_ser_taxa: Series used to compute the top most abundant/prevalent taxa.
              (mean relative abundance, if plotting most abundant taxa, and prevalence,
              if plotting most prevalent taxa)
            other_variable_ser_taxa: Series used to filter some taxa out based on the other statistical variable.
              (so on prevalence, if plotting most abundant taxa, and on mean counts,
              if plotting most prevalent taxa)
            taxa_number: Number of taxa to plot (skipped by determining_threshold).
            determining_threshold: (optional) Set a threshold, if rather than show a certain number of taxa,
              you want to show all taxa with an equal or higher prevalence/relative abundance.
              A threshold of 0 is treated like no threshold.
            higher_classification: Set to False, if you do not want OTU only defined at a higher level
              to appear in the top. They will still be included in relative abundances.
              Such taxa are recognized by a "(" in their name.
            threshold_on_other_variable: (optional) The threshold of the other variable used to
              filter some taxa out. A threshold of 0 is treated like no threshold.
            ascending: Set to True, if you want the taxa to be ordered from least abundant/prevalent
              taxa of the top, to most abundant/prevalent taxa of the top.

        Returns:
            Index of the names of the selected taxa, in the order in which they should be plotted.
        """
        if not higher_classification:
            only_higher_level = other_variable_ser_taxa.index.str.contains("(", regex=False)
            other_variable_ser_taxa = other_variable_ser_taxa[~only_higher_level]

        if threshold_on_other_variable:
            sp_to_keep = other_variable_ser_taxa[other_variable_ser_taxa >= threshold_on_other_variable].index
        else:
            sp_to_keep = other_variable_ser_taxa.index
        determining_ser_taxa = determining_ser_taxa.loc[sp_to_keep]

        if determining_threshold:
            top_species = determining_ser_taxa[determining_ser_taxa >= determining_threshold]
            top_species = top_species.sort_values(ascending=ascending).index
        else:
            # The top is always taken from the highest values; only its display order is reversed.
            top_species = determining_ser_taxa.sort_values(ascending=False).iloc[:taxa_number].index
            if ascending:
                top_species = top_species[::-1]

        if len(top_species) == 0:
            logger.warning(
                "No species abide by the threshold(s) given. You may want to try to lower your threshold(s)."
            )
        return top_species

    def _cluster_samples(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Reorder samples (columns) following the leaves order of a hierarchical clustering.

        The clustering is a single linkage computed on the pairwise distances between columns
        (default metric of scipy's pdist), without the row "Others".

        Args:
            df: dataframe whose columns need to be reordered; it must contain a row named "Others".
        """
        # Determine samples order using hierarchical clustering
        Z = hierarchy.linkage(
            distance.pdist(df.drop(["Others"]).T),
            method="single",
            metric="euclidean",
            optimal_ordering=False,
        )
        order = hierarchy.leaves_list(Z)
        return df.iloc[:, order]

    def _divide_samples_into_subgroups_and_reorder(
        self,
        top_and_other_df: pd.DataFrame,
        sep_series: pd.Series,
        cluster_samples: bool = True,
    ) -> Tuple[pd.DataFrame, List[Tuple[float, float, float]], np.ndarray]:
        """
        divide samples into subgroups and reorder them using hierarchical clustering
        if cluster_samples is True.

        Args:
            top_and_other_df: DataFrame of the relative abundances of the top taxa and of Others
              (every taxa not in the top is considered as Others).
            sep_series: Series of the metadata used to divide samples. Samples with a missing value
              (NaN, None...) are gathered in a subgroup of their own.
            cluster_samples: Set to False, if you don't want the samples to be clusterize
              using hierarchical clustering.

        Returns:
            The dataframe with its columns reordered subgroup after subgroup, the x coordinates
            of the annotation of every subgroup, and the unique values of sep_series.
        """
        ordered_col = []
        x_coor = []
        prec = -0.5
        subgps = sep_series.unique()
        for subgp in subgps:
            # pd.isna handles any kind of label (str, None, number...), np.isnan only handles numbers.
            if pd.isna(subgp):
                members = sep_series[sep_series.isna()].index
            else:
                members = sep_series[sep_series == subgp].index
            df_gp = top_and_other_df[members.intersection(top_and_other_df.columns)]

            # Clustering needs at least 2 samples.
            if cluster_samples and len(df_gp.columns) > 1:
                ordered_col += list(self._cluster_samples(df_gp).columns)
            else:
                ordered_col += list(df_gp.columns)

            nb = len(df_gp.columns)
            # (x of the start of the subgroup annotation square,
            #  x of the annotation text,
            #  x of the end of the subgroup annotation square)
            x_coor.append((prec, prec + nb / 2, prec + nb))
            prec += nb
        top_and_other_df = top_and_other_df[ordered_col]
        return top_and_other_df, x_coor, subgps

    def _compute_relative_abundances_taxa_dataframe(
        self,
        taxa_level: str = "species",
        taxa_number: int = 20,
        average_relative_abundance_threshold: Optional[float] = None,
        higher_classification: bool = True,
        prevalence_threshold: Optional[float] = None,
        ascending: bool = False,
    ) -> Tuple[pd.DataFrame, int, pd.Series]:
        """
        Compute for each samples, the relative abundances for the n (taxa_number) most abundant taxa
        across all the samples.

        Args:
            taxa_level: Taxonomy level.
            taxa_number: Number of taxa to plot (skipped by average_relative_abundance_threshold).
            average_relative_abundance_threshold: (optional) Set a threshold, if you want to show all taxa
              with an equal or greater average relative abundance.
            higher_classification: Set to False, if you do not want OTU only defined at a higher level
              to appear in the top. They will still be included in the relative abundances.
            prevalence_threshold: Prevalence threshold for a taxa to be kept in analysis.
            ascending: If set to True, taxa are ordered from least abundant taxa of the top
              to most abundant taxa.

        Returns:
            The relative abundances of the selected taxa, the number of selected taxa,
            and the mean relative abundance of the selected taxa.
        """
        # Rows sharing the same value at taxa_level are summed up; the result is indexed by
        # taxa_level only (MultiIndex -> single Index).
        relab_df_taxa = self.relative_abundance_dataframe.groupby(taxa_level).sum()
        mean_relab_ser_taxa = relab_df_taxa.mean(axis=1)
        prev_ser_taxa = self.prevalence_series.groupby(taxa_level).mean()

        top_ab = self._generate_list_species_to_plot(
            mean_relab_ser_taxa,
            prev_ser_taxa,
            taxa_number,
            average_relative_abundance_threshold,
            higher_classification,
            prevalence_threshold,
            ascending,
        )
        # The number of taxa kept may differ from the number asked for
        # (thresholds, or less taxa available than asked for).
        taxa_number = len(top_ab)
        return relab_df_taxa.loc[top_ab], taxa_number, mean_relab_ser_taxa.loc[top_ab]

    def _samples_to_long_format(self, relab_df_taxa: pd.DataFrame) -> pd.DataFrame:
        """
        Stack the columns of relab_df_taxa one under the other.

        The result has one row per (column, taxa) couple, the name of the original column as index,
        and two columns: "species" (taxa name) and "relative abundance".
        """
        nb = relab_df_taxa.shape[0]
        frames = []
        for sample in relab_df_taxa.columns:
            frame = relab_df_taxa[sample].reset_index()
            frame.index = nb * [sample]
            frame.columns = ["species", "relative abundance"]
            frames.append(frame)
        # One concatenation for all samples (DataFrame.append is not available anymore in pandas >= 2.0).
        return pd.concat(frames)

    def _plot_most_abundant_taxa_bargraph(
        self,
        taxa_level: str = "species",
        taxa_number: int = 20,
        average_relative_abundance_threshold: Optional[float] = None,
        higher_classification: bool = True,
        prevalence_threshold: Optional[float] = None,
        ascending: bool = False,
        plotting_options: Optional[dict] = None,
        **kwargs,
    ) -> "go.Figure":
        """
        Generate Bar Graph of the most abundant taxa

        Args:
            taxa_level: Taxonomy level.
            taxa_number: Number of taxa to plot (skipped by average_relative_abundance_threshold).
            average_relative_abundance_threshold: (optional) Set a threshold, if you want to show all species
              with an equal or greater average relative abundance.
            higher_classification: Set to False, if you do not want OTU only defined at a higher level
              to appear in the top. They will still be included in relative abundances.
            prevalence_threshold: Prevalence threshold for a taxa to be kept in analysis.
            ascending: If set to True, from top to bottom, from least abundant taxa of the top
              to most abundant taxa.
            plotting_options: options for the layout of the graph.
            **kwargs: passed on to the plot_one_graph method of the graph.
        """
        if plotting_options is None:
            plotting_options = {}

        # The order asked for is inverted before selecting the taxa, the graph being horizontal.
        taxa_number, average_abundance_ser = self._compute_relative_abundances_taxa_dataframe(
            taxa_level=taxa_level,
            taxa_number=taxa_number,
            average_relative_abundance_threshold=average_relative_abundance_threshold,
            higher_classification=higher_classification,
            prevalence_threshold=prevalence_threshold,
            ascending=not ascending,
        )[1:]
        average_abundance_ser.index = average_abundance_ser.index.map(self._italicize_taxa_name)

        # Make graph
        graph = BarGraph(average_abundance_ser)
        # Plotting options
        title = f"{taxa_number} most abundant {taxa_level}"
        if prevalence_threshold:
            title += f" (present in at least {prevalence_threshold}% of samples)"
        default_plotting_options = {
            "layout": {
                "title": title,
                "xaxis_title": "Average relative abundance",
                "yaxis_title": taxa_level.capitalize(),
            }
        }
        plotting_options = merge_dict(plotting_options, default_plotting_options)
        fig = graph.plot_one_graph(orientation="h", plotting_options=plotting_options, **kwargs)
        return fig

    def _plot_most_what_taxa_boxplot_or_violin(
        self,
        what: str,
        mode: str,
        taxa_level: str = "species",
        taxa_number: int = 20,
        determining_threshold: Optional[float] = None,
        higher_classification: bool = True,
        threshold_on_other_variable: Optional[float] = None,
        ascending: bool = False,
        plotting_options: Optional[dict] = None,
        mean_info: bool = False,
        **kwargs,
    ) -> "go.Figure":
        """
        Generate Box or Violin plot showing every samples' relative abundance as a point,
        for the top most abundant/prevalent taxa.

        Args:
            what: { "abundant", "prevalent" }
              Variable used to determine and order the taxa shown in the graph.
            mode: { "boxplot", "violin" }
              Mode of the graph ("violin" gives a violin plot, anything else a box plot).
            taxa_level: Taxonomy level.
            taxa_number: Number of taxa to plot (skipped by determining_threshold).
            determining_threshold: (optional) Set a threshold, if rather than show a certain number of taxa,
              you want to show all taxa with an equal or higher prevalence/relative abundance.
            higher_classification: Set to False, if you do not want OTU only defined at a higher level
              to appear in the top. They will still be included in relative abundances.
            threshold_on_other_variable: (optional) Set a threshold to filter some taxa out of top,
              based on the other variable (prevalence if what is "abundant",
              mean counts if what is "prevalent").
            ascending: If set to True, from top to bottom, from least abundant/prevalent taxa of the top
              to most abundant/prevalent taxa.
            mean_info: To show the taxa's mean counts in the graph, next to the taxa's name
              (only used if what is "prevalent").
            plotting_options: options for the layout of the graph.
            **kwargs: passed on to the plot_one_graph method of the graph.

        Raises:
            ValueError: if what is neither "abundant" nor "prevalent".
        """
        if what not in ("abundant", "prevalent"):
            raise ValueError(f"what='{what}' not valid, it should be 'abundant' or 'prevalent'.")
        if plotting_options is None:
            plotting_options = {}

        title = ""
        # to have from top to bottom, the most abundant/prevalent species to the least abundant/prevalent
        # species of the top, you need to sort_values descendingly
        ascending = not ascending

        # Rows sharing the same value at taxa_level are summed up; the result is indexed by
        # taxa_level only (MultiIndex -> single Index).
        relab_df_taxa = self.relative_abundance_dataframe.groupby(taxa_level).sum()
        prev_ser_taxa = self.prevalence_series.groupby(taxa_level).mean()

        if what == "abundant":
            mean_relab_ser_taxa = relab_df_taxa.mean(axis=1)
            groups = self._generate_list_species_to_plot(
                mean_relab_ser_taxa,
                prev_ser_taxa,
                taxa_number,
                determining_threshold,  # = average_relative_abundance_threshold
                higher_classification,
                threshold_on_other_variable,  # = prevalence_threshold
                ascending,
            )
            relab_df_taxa = relab_df_taxa.loc[groups]
            if threshold_on_other_variable:
                title = f" (present in at least {threshold_on_other_variable}% of samples)"
        else:
            mean_counts_ser_taxa = self.df.groupby(taxa_level).sum().mean(axis=1)
            groups = self._generate_list_species_to_plot(
                prev_ser_taxa,
                mean_counts_ser_taxa,
                taxa_number,
                determining_threshold,  # = prevalence_threshold
                higher_classification,
                threshold_on_other_variable,  # = mean_threshold
                ascending=ascending,
            )
            relab_df_taxa = relab_df_taxa.loc[groups]
            if mean_info:  # adding mean information
                relab_df_taxa = self._add_mean_info_to_index(relab_df_taxa, mean_counts_ser_taxa)
                groups = list(relab_df_taxa.index)
            if threshold_on_other_variable:
                title = f" (with mean among samples > {threshold_on_other_variable})"

        relab_df_taxa2 = self._samples_to_long_format(relab_df_taxa)
        relab_df_taxa2.species = relab_df_taxa2.species.apply(self._italicize_taxa_name)
        groups = [self._italicize_taxa_name(name) for name in groups]

        # Make graph
        if mode == "violin":
            graph = GroupViolinGraph(relab_df_taxa2)
        else:
            graph = GroupBoxGraph(relab_df_taxa2)

        # Plotting options
        final_colors = {name: "#778899" for name in groups}
        default_plotting_options = {
            "layout": {
                "title": f"Relative abundance of the {len(groups)} most {what} microbial genomes among individuals "
                "of the cohort" + title,
                "xaxis_type": "log",
                "showlegend": False,
            },
            "xaxes": {"title_text": "Relative abundance (in percentage)"},
        }
        plotting_options = merge_dict(plotting_options, default_plotting_options)

        fig = graph.plot_one_graph(
            data_col="relative abundance",
            group_col="species",
            groups=groups,
            colors=final_colors,
            orientation="h",
            plotting_options=plotting_options,
            **kwargs,
        )
        return fig

    def _plot_most_prevalent_taxa_bargraph(
        self,
        taxa_level: str = "species",
        taxa_number: int = 20,
        prevalence_threshold: Optional[float] = None,
        higher_classification: bool = True,
        mean_threshold: Optional[float] = None,
        mean_info: bool = True,
        ascending: bool = False,
        plotting_options: Optional[dict] = None,
        **kwargs,
    ) -> "go.Figure":
        """
        Generate Bar Graph of the most prevalent taxa

        Args:
            taxa_level: Taxonomy level.
            taxa_number: Number of taxa to plot (skipped by prevalence_threshold).
            prevalence_threshold: (optional) Set a threshold, if you want to show all species
              with an equal or greater prevalence.
            higher_classification: Set to False, if you do not want OTU only defined at a higher level
              to appear in the top.
            mean_threshold: Mean threshold for a taxa to be kept in analysis.
            mean_info: To show the taxa's mean counts in the graph, next to the taxa's name.
            ascending: If set to True, from top to bottom, from least prevalent taxa of the top
              to most prevalent taxa.
            plotting_options: options for the layout of the graph.
            **kwargs: passed on to the plot_one_graph method of the graph.
        """
        if plotting_options is None:
            plotting_options = {}

        # The order asked for is inverted before selecting the taxa, the graph being horizontal.
        ascending = not ascending
        title = ""

        # Both series are indexed by taxa_level only (MultiIndex -> single Index).
        prev_ser_taxa = self.prevalence_series.groupby(taxa_level).mean()
        mean_counts_ser_taxa = self.df.groupby(taxa_level).sum().mean(axis=1)

        top_prev = self._generate_list_species_to_plot(
            prev_ser_taxa,
            mean_counts_ser_taxa,
            taxa_number,
            prevalence_threshold,
            higher_classification,
            mean_threshold,
            ascending,
        )
        prev_ser_taxa = prev_ser_taxa.loc[top_prev]

        if mean_info:  # adding mean information
            prev_ser_taxa = self._add_mean_info_to_index(prev_ser_taxa, mean_counts_ser_taxa)
        prev_ser_taxa.index = prev_ser_taxa.index.map(self._italicize_taxa_name)

        if mean_threshold:
            title = f" (with mean among samples > {mean_threshold})"

        # in the case that there is less species that the number asked for
        taxa_number = len(top_prev)

        # Make graph
        graph = BarGraph(prev_ser_taxa)
        # Plotting options
        default_plotting_options = {
            "layout": {
                "title": f"{taxa_number} most prevalent {taxa_level}" + title,
                "xaxis_title": "Percentage Sample",
                "yaxis_title": taxa_level.capitalize(),
            }
        }
        plotting_options = merge_dict(plotting_options, default_plotting_options)
        fig = graph.plot_one_graph(orientation="h", plotting_options=plotting_options, **kwargs)
        return fig

    def plot_most_prevalent_taxa(
        self,
        mode: str = "bargraph",
        taxa_level: str = "species",
        taxa_number: int = 20,
        prevalence_threshold: Optional[float] = None,
        higher_classification: bool = True,
        mean_threshold: Optional[float] = None,
        mean_info: bool = False,
        ascending: bool = False,
        **kwargs,
    ) -> "go.Figure":
        """
        Generate a plot of most prevalent taxa.

        Args:
            mode: { 'bargraph' (default), 'boxplot', 'violin' }
              Bargraph will show you the prevalence of the most prevalent taxa among all the samples.
              Boxplot and violin plot will show every samples' relative abundance as a point,
              for the top most prevalent taxa.
            taxa_level: Taxonomy level.
            taxa_number: Number of taxa to plot (skipped by prevalence_threshold).
            prevalence_threshold: (optional) Set a threshold, if you want to show all species
              with an equal or greater prevalence.
            higher_classification: Set to False, if you do not want OTU only defined at a higher level
              to appear in the top.
            mean_threshold: Mean threshold for a taxa to be kept in analysis.
            mean_info: To show the taxa's mean counts in the graph, next to the taxa's name.
            ascending: If set to True, from top to bottom, from least prevalent taxa of the top
              to most prevalent taxa.
            **kwargs: "plotting_options" (options for the layout of the graph) is read from it;
              the rest is passed on to the plotting method of the graph.
        """
        plotting_options = kwargs.pop("plotting_options", {})
        mode = self._valid_mode_param(mode)

        if mode == "bargraph":
            return self._plot_most_prevalent_taxa_bargraph(
                taxa_level=taxa_level,
                taxa_number=taxa_number,
                prevalence_threshold=prevalence_threshold,
                higher_classification=higher_classification,
                mean_threshold=mean_threshold,
                mean_info=mean_info,
                ascending=ascending,
                plotting_options=plotting_options,
                **kwargs,
            )
        return self._plot_most_what_taxa_boxplot_or_violin(
            "prevalent",
            mode,
            taxa_level=taxa_level,
            taxa_number=taxa_number,
            determining_threshold=prevalence_threshold,
            higher_classification=higher_classification,
            threshold_on_other_variable=mean_threshold,
            ascending=ascending,
            plotting_options=plotting_options,
            mean_info=mean_info,
            **kwargs,
        )

    def plot_most_abundant_taxa(
        self,
        mode: str = "bargraph",
        taxa_level: str = "species",
        taxa_number: int = 20,
        average_relative_abundance_threshold: Optional[float] = None,
        higher_classification: bool = True,
        prevalence_threshold: Optional[float] = None,
        ascending: bool = False,
        **kwargs,
    ) -> "go.Figure":
        """
        Generate a plot of most abundant taxa.

        Args:
            mode: { 'bargraph' (default), 'boxplot', 'violin' }
              Bargraph will show you the mean relative abundance of the most abundant species
              among all the samples.
              Boxplot and violin plot will show every samples' relative abundance as a point.
            taxa_level: Taxonomy level.
            taxa_number: Number of taxa to plot (skipped by average_relative_abundance_threshold).
            average_relative_abundance_threshold: (optional) Set a threshold, if you want to show all species
              with an equal or greater average relative abundance.
            higher_classification: Set to False, if you do not want OTU only defined at a higher level
              to appear in the top. They will still be included in the relative abundances.
            prevalence_threshold: Prevalence threshold for a taxa to be kept in analysis.
            ascending: If set to True, from top to bottom, from least abundant taxa of the top
              to most abundant taxa.
            **kwargs: "plotting_options" (options for the layout of the graph) is read from it;
              the rest is passed on to the plotting method of the graph.
        """
        plotting_options = kwargs.pop("plotting_options", {})
        mode = self._valid_mode_param(mode)

        if mode == "bargraph":
            return self._plot_most_abundant_taxa_bargraph(
                taxa_level=taxa_level,
                taxa_number=taxa_number,
                average_relative_abundance_threshold=average_relative_abundance_threshold,
                higher_classification=higher_classification,
                prevalence_threshold=prevalence_threshold,
                ascending=ascending,
                plotting_options=plotting_options,
                **kwargs,
            )
        return self._plot_most_what_taxa_boxplot_or_violin(
            "abundant",
            mode,
            taxa_level,
            taxa_number,
            determining_threshold=average_relative_abundance_threshold,
            higher_classification=higher_classification,
            threshold_on_other_variable=prevalence_threshold,
            ascending=ascending,
            plotting_options=plotting_options,
            **kwargs,
        )

    def plot_sample_composition_most_abundant_taxa(
        self,
        taxa_level: str = "species",
        taxa_number: int = 20,
        average_relative_abundance_threshold: Optional[float] = None,
        higher_classification: bool = True,
        prevalence_threshold: Optional[float] = None,
        cluster_samples: bool = True,
        samples_order: Optional[List[str]] = None,
        color_df: Optional[pd.DataFrame] = None,
        sep_series: Optional[pd.Series] = None,
        sep_how: Optional[str] = None,
        **kwargs,
    ) -> "go.Figure":
        """
        Plot taxa composition of samples for most abundant taxa.

        Args:
            taxa_level: Taxonomy level.
            taxa_number: Number of taxa to plot (skipped by average_relative_abundance_threshold).
            average_relative_abundance_threshold: (optional) Set a threshold, if you want to show all species
              with an equal or greater average relative abundance.
            higher_classification: Set to False, if you do not want OTU only defined at a higher level
              to appear in the top. They will still appear in "Others".
            prevalence_threshold: Prevalence threshold for a taxa to be kept in analysis.
            cluster_samples: Use clustering (skipped by samples_order).
            samples_order: List of samples to force ordering for visualization.
            color_df: Metadata to put as legend on the bottom of the graph.
            sep_series: Metadata used to order samples into subgroups (skipped by samples_order,
              and when there is only one sample).
            sep_how: { None (default), 'color', 'labels' }
              Graphical way of showing the separation of the different subgroups
              (skipped if sep_series is empty/None or skipped).
            **kwargs: "plotting_options" (options for the layout of the graph) is read from it;
              the rest is passed on to the plotting method of the graph.
        """
        data_df, taxa_number = self._compute_relative_abundances_taxa_dataframe(
            taxa_level=taxa_level,
            taxa_number=taxa_number,
            average_relative_abundance_threshold=average_relative_abundance_threshold,
            higher_classification=higher_classification,
            prevalence_threshold=prevalence_threshold,
            ascending=False,
        )[:2]
        # Everything that is not in the top is gathered in "Others", so that every sample sums to 100.
        data_df.loc["Others"] = 100 - data_df.sum()

        # Only filled when samples are really divided into subgroups: the annotations of the
        # subgroups are meaningless (and their coordinates unknown) in every other case.
        subgroups_layout = None
        if data_df.shape[1] <= 1:
            pass  # only 1 sample, no need for ordering
        elif samples_order is not None:
            data_df = data_df.loc[:, samples_order]
        elif sep_series is not None:
            # organize samples inside subgroups and concatenate subgroups one after another
            data_df, x_coor, subgps = self._divide_samples_into_subgroups_and_reorder(
                data_df, sep_series, cluster_samples=cluster_samples
            )
            subgroups_layout = (x_coor, subgps)
            if sep_how == "color":
                if color_df is None:
                    color_df = pd.DataFrame(sep_series)
                elif sep_series.name not in color_df.columns:
                    color_df = color_df.merge(pd.DataFrame(sep_series), right_index=True, left_index=True)
        elif cluster_samples:
            data_df = self._cluster_samples(data_df)

        # Make graph
        graph = MatrixBarGraph(data_df)

        # Plotting options
        title = f"{taxa_level.capitalize()} composition for the top {taxa_number} most abundant species across samples"
        if prevalence_threshold is not None:
            title += f" (present in at least {prevalence_threshold}% of samples)"
        default_plotting_options = {
            "layout": {
                "title": title,
                "xaxis_title": "Samples",
                "yaxis_title": "Relative abundance",
                "legend": {"traceorder": "normal"},
                "legend_title_text": "species",
            }
        }
        plotting_options = merge_dict(kwargs.pop("plotting_options", {}), default_plotting_options)

        if subgroups_layout is not None and sep_how == "labels":
            # The figure is built without being shown/saved, annotated, and only then shown/saved.
            show = kwargs.pop("show", True)
            output_file = kwargs.pop("output_file", False)
            if color_df is None:
                fig = graph.plot_one_graph(plotting_options=plotting_options, **kwargs, show=False)
            else:
                fig = graph.plot_complex_graph(color_df, plotting_options=plotting_options, **kwargs, show=False)
            x_coor, subgps = subgroups_layout
            fig = add_groups_annotations(fig, x_coor, subgps)
            graph._handle_output_plotly(fig, show, output_file)
        elif color_df is None:
            fig = graph.plot_one_graph(plotting_options=plotting_options, **kwargs)
        else:
            fig = graph.plot_complex_graph(color_df, plotting_options=plotting_options, **kwargs)

        return fig