Source code for moonstone.filtering.taxonomy_filtering

from typing import List

import pandas as pd

from moonstone.filtering.base import CountsFiltering


class TaxonomyNamesFiltering(CountsFiltering):
    """
    Select or exclude rows of a Taxonomy multiindexed dataframe by index name.

    Rows are matched against a list of names on one chosen level of the
    MultiIndex; matching rows are either kept or discarded.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        names: List[str],
        level: str = 'species',
        keep: bool = True
    ):
        """
        :param dataframe: dataframe handed to the parent initializer; it is
            read back as ``self.df`` by the methods of this class
        :param names: list of index names to match on ``level``
        :param level: level of the MultiIndex to filter on
        :param keep: keep the matching rows (discard them if set to False)
        :raises ValueError: if ``level`` is not one of the index level names
        """
        super().__init__(dataframe)
        self.level = level
        self.names = names
        self.keep = keep
        # Fail at construction time rather than when filter() is called.
        self._validate_parameters()

    def _validate_parameters(self):
        """Raise ValueError when ``self.level`` is not a level name of the index."""
        if self.level in self.df.index.names:
            return
        raise ValueError(
            f"{self.level} not a valid level. Must be among {self.df.index.names}"
        )

    def filter(self) -> pd.DataFrame:
        """
        Return the rows whose value on ``self.level`` is (``keep=True``) or
        is not (``keep=False``) listed in ``self.names``.
        """
        level_values = self.df.index.get_level_values(self.level)
        is_listed = level_values.isin(self.names)
        # Invert the boolean mask when the listed names must be discarded.
        row_mask = is_listed if self.keep else ~is_listed
        return self.df.loc[row_mask]
class TaxonomyMeanFiltering(CountsFiltering):
    """
    Filter a Taxonomy multiindexed dataframe on the sample mean at a chosen level.

    Rows are summed per name of the chosen level, the mean of each summed row
    is taken across columns, and only the rows belonging to names whose mean
    is strictly greater than the given value are retained.
    """

    def __init__(
        self,
        dataframe: pd.DataFrame,
        mean_value: float,
        level: str = 'species'
    ):
        """
        :param dataframe: dataframe handed to the parent initializer; it is
            read back as ``self.df`` by the methods of this class
        :param mean_value: threshold on the mean among all samples; a name is
            kept only when its mean is strictly above this value
        :param level: level of the MultiIndex to filter on
        :raises ValueError: if ``level`` is not one of the index level names
        """
        super().__init__(dataframe)
        self.level = level
        self.mean_value = mean_value
        # Fail at construction time rather than when filter() is called.
        self._validate_parameters()

    def _validate_parameters(self):
        """Raise ValueError when ``self.level`` is not a level name of the index."""
        if self.level in self.df.index.names:
            return
        raise ValueError(
            f"{self.level} not a valid level. Must be among {self.df.index.names}"
        )

    def filter(self) -> pd.DataFrame:
        """
        Return the rows of the dataframe whose name on ``self.level`` has a
        summed-row mean strictly greater than ``self.mean_value``.
        """
        # Collapse the dataframe to one row per name of the chosen level.
        summed_by_level = self.df.groupby(self.level).sum()
        level_means = summed_by_level.mean(axis=1)
        retained_names = level_means.loc[level_means > self.mean_value].index
        # Map the retained names back onto the original (non-aggregated) rows.
        row_mask = self.df.index.get_level_values(self.level).isin(retained_names)
        return self.df.loc[row_mask]