Source code for dms_tools2.diffsel

"""
===========
diffsel
===========

Performs operations related to estimating differential selection.
"""

import math
import os
import io
import tempfile

import natsort
import numpy
import pandas
import scipy.stats
from dms_tools2 import CODONS, CODON_TO_AA


def beta_diversity(tidy_df, *, samplecol, sitecol, valcol, index,
                   dropdups=True):
    """Computes beta diversity of selection across sites.

    Args:
        `tidy_df` (pandas DataFrame)
            Tidy data frame with selection values.
        `samplecol` (str)
            Column in `tidy_df` giving different samples.
        `sitecol` (str)
            Column in `tidy_df` giving sites.
        `valcol` (str)
            Column in `tidy_df` giving value (e.g., 'positive_diffsel').
        `index` ({'shannon', 'simpson'})
            Type of diversity index to use.
        `dropdups` (bool)
            If there are duplicate entries for a site, drop them?
            Otherwise raise an error.

    Returns:
        The beta diversity (ratio of gamma to alpha diversity).

    For each sample, the diversity is calculated by examining the total
    fraction of selection (as defined in `valcol`) allocated to each site
    (as defined in `sitecol`) and considering that to represent the
    weighted "species" abundance. Alpha and gamma diversity are then
    calculated using the indicated index as in Jost, "Partitioning
    diversity into independent alpha and beta components"
    (https://pdfs.semanticscholar.org/88c4/67e65b42eff088d19354632f41df6ca5804d.pdf).
    The beta diversity is the ratio of gamma to alpha diversity: larger
    values indicate the samples are more diverse in sites of selection.

    As an example, first create a data frame for three samples. Sample
    `a` is similar to `b`, but both are quite different from `c`:

    >>> a_df = pandas.DataFrame({
    ...     'sample': 'a',
    ...     'site': [1, 2, 3, 4],
    ...     'positive_diffsel': [0.2, 5.1, 0.2, 0.3]})
    >>> b_df = pandas.DataFrame({
    ...     'sample': 'b',
    ...     'site': [1, 2, 3, 4],
    ...     'positive_diffsel': [0.3, 4.2, 0.2, 0.3]})
    >>> c_df = pandas.DataFrame({
    ...     'sample': 'c',
    ...     'site': [1, 2, 3, 4],
    ...     'positive_diffsel': [0.3, 0.2, 0.2, 4.3]})
    >>> tidy_df = pandas.concat([a_df, b_df, c_df], ignore_index=True)
    >>> tidy_df
       sample  site  positive_diffsel
    0       a     1               0.2
    1       a     2               5.1
    2       a     3               0.2
    3       a     4               0.3
    4       b     1               0.3
    5       b     2               4.2
    6       b     3               0.2
    7       b     4               0.3
    8       c     1               0.3
    9       c     2               0.2
    10      c     3               0.2
    11      c     4               4.3

    Calculate the beta diversity of all three samples using the Shannon
    index. The value is relatively high because sample `c` is a lot
    different than `a` and `b`:

    >>> numpy.allclose(beta_diversity(tidy_df,
    ...                               samplecol='sample',
    ...                               sitecol='site',
    ...                               valcol='positive_diffsel',
    ...                               index='shannon'),
    ...                1.4914, atol=1e-4)
    True

    If we repeat the same using the Simpson index, we get a higher
    diversity because the large-selection sites (which are up-weighted
    by the Simpson index relative to Shannon) are different between `c`
    and `a` / `b`:

    >>> numpy.allclose(beta_diversity(tidy_df,
    ...                               samplecol='sample',
    ...                               sitecol='site',
    ...                               valcol='positive_diffsel',
    ...                               index='simpson'),
    ...                1.6478, atol=1e-4)
    True

    If we calculate the beta diversity of just samples `a` and `b`, we
    get a smaller value because those samples are quite similar:

    >>> numpy.allclose(beta_diversity(tidy_df.query('sample in ["a", "b"]'),
    ...                               samplecol='sample',
    ...                               sitecol='site',
    ...                               valcol='positive_diffsel',
    ...                               index='shannon'),
    ...                1.0022, atol=1e-4)
    True

    Here the Simpson index gives lower diversity than the Shannon since
    `a` and `b` share the largest selection site:

    >>> numpy.allclose(beta_diversity(tidy_df.query('sample in ["a", "b"]'),
    ...                               samplecol='sample',
    ...                               sitecol='site',
    ...                               valcol='positive_diffsel',
    ...                               index='simpson'),
    ...                1.0008, atol=1e-4)
    True

    And of course the beta diversity of two identical samples is one:

    >>> numpy.allclose(beta_diversity(
    ...         pandas.concat([a_df, a_df.assign(sample='a2')]),
    ...         samplecol='sample',
    ...         sitecol='site',
    ...         valcol='positive_diffsel',
    ...         index='shannon'),
    ...     1, atol=1e-4)
    True
    >>> numpy.allclose(beta_diversity(
    ...         pandas.concat([a_df, a_df.assign(sample='a2')]),
    ...         samplecol='sample',
    ...         sitecol='site',
    ...         valcol='positive_diffsel',
    ...         index='simpson'),
    ...     1, atol=1e-4)
    True
    """
    cols = [samplecol, sitecol, valcol]
    if not set(cols) <= set(tidy_df.columns):
        raise ValueError(f"`tidy_df` lacks required columns: {cols}")

    if len(tidy_df[samplecol].unique()) < 2:
        raise ValueError('`tidy_df` must have at least two unique samples')

    df = (tidy_df
          [cols]
          .drop_duplicates()
          )
    if (len(df) != len(tidy_df)) and not dropdups:
        raise ValueError(f"`tidy_df` has duplicate entries in columns: {cols}")

    sites = set(df[sitecol].unique())
    if any(set(idf[sitecol]) != sites for _, idf in df.groupby(samplecol)):
        raise ValueError('not same set of sites for all samples in `tidy_df`')
    if any(len(sites) != len(idf[sitecol])
           for _, idf in df.groupby(samplecol)):
        raise ValueError('duplicate sites for a sample in `tidy_df`.')

    if 'normval' in cols:
        raise ValueError('`tidy_df` cannot have column named "normval"')
    df = (df
          .assign(normval=lambda x: (x[valcol] /
                                     (x.groupby(samplecol)
                                      [valcol]
                                      .transform('sum')
                                      )
                                     )
                  )
          [[samplecol, sitecol, 'normval']]
          )

    if index == 'shannon':
        gamma_diversity = math.exp(scipy.stats.entropy(
                df
                .groupby(sitecol)
                .aggregate({'normval': 'mean'})
                ['normval']
                ))
        alpha_diversity = math.exp(
                df
                .groupby(samplecol)
                ['normval']
                .apply(scipy.stats.entropy)
                .reset_index()
                ['normval']
                .mean()
                )
    elif index == 'simpson':
        gamma_diversity = 1 / (
                df
                .groupby(sitecol)
                .aggregate({'normval': 'mean'})
                .assign(normval=lambda x: x['normval']**2)
                ['normval']
                .sum()
                )
        alpha_diversity = 1 / (
                df
                .assign(normval=lambda x: x['normval']**2)
                .groupby(samplecol)
                ['normval']
                .aggregate('sum')
                .reset_index()
                ['normval']
                .mean()
                )
    else:
        raise ValueError(f"invalid `index` of {index}")

    return gamma_diversity / alpha_diversity

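
# A minimal sketch, not part of the original module, that re-derives by hand
# the Shannon and Simpson beta diversities reported in the doctests above for
# samples `a` and `b` (1.0022 and 1.0008). The helper name is hypothetical; it
# only illustrates the gamma / alpha ratio described in the docstring.
def _sketch_beta_diversity_by_hand():
    """Hand calculation of beta diversity for doctest samples `a` and `b`."""
    a = numpy.array([0.2, 5.1, 0.2, 0.3])  # positive_diffsel for sample a
    b = numpy.array([0.3, 4.2, 0.2, 0.3])  # positive_diffsel for sample b
    p_a = a / a.sum()  # fraction of sample a's selection at each site
    p_b = b / b.sum()
    p_pool = (p_a + p_b) / 2  # per-site mean across samples

    # Shannon: diversity is the exponential of the Shannon entropy
    gamma_shannon = math.exp(scipy.stats.entropy(p_pool))
    alpha_shannon = math.exp((scipy.stats.entropy(p_a) +
                              scipy.stats.entropy(p_b)) / 2)
    assert numpy.allclose(gamma_shannon / alpha_shannon, 1.0022, atol=1e-4)

    # Simpson: diversity is the inverse Simpson concentration, 1 / sum(p**2)
    gamma_simpson = 1 / (p_pool**2).sum()
    alpha_simpson = 1 / numpy.mean([(p_a**2).sum(), (p_b**2).sum()])
    assert numpy.allclose(gamma_simpson / alpha_simpson, 1.0008, atol=1e-4)

    return gamma_shannon / alpha_shannon, gamma_simpson / alpha_simpson
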

def tidyToWide(tidy_df, valuecol):
    """Converts tidy `diffsel` data frame to wide form.

    The `diffsel` data frames returned by ``dms2_diffsel`` (and also other
    data frames, such as the `fracsurvive` ones from ``dms2_fracsurvive``)
    are in tidy form. This function converts them to wide form.

    Args:
        `tidy_df` (pandas DataFrame)
            Data frame in tidy form. Should have columns named `site`,
            `wildtype`, `mutation`, and something with the name matching
            `valuecol`.
        `valuecol` (string)
            Name of value column in `df`, such as `diffsel` or
            `fracsurvive`.

    Returns:
        Wide form dataframe. Will have columns `site` (as string),
        `wildtype`, and all characters (e.g., amino acids) for which
        values are given. Natural sorted by `site`.

    >>> tidy_df = pandas.read_csv(io.StringIO(
    ...         '''site wildtype mutation diffsel
    ...            3 A D -1.5
    ...            3 A C 10.1
    ...            2 A C 10.1
    ...            1 C D 9.5
    ...            1 C A 0.2
    ...            2 A D -1.5'''),
    ...         delim_whitespace=True, index_col=False)
    >>> wide_df = tidyToWide(tidy_df, valuecol='diffsel')
    >>> print(wide_df.to_string(float_format=lambda x: '{0:.1f}'.format(x)))
      site   A     C    D wildtype
    0    1 0.2   0.0  9.5        C
    1    2 0.0  10.1 -1.5        A
    2    3 0.0  10.1 -1.5        A
    """
    assert isinstance(tidy_df, pandas.DataFrame)
    cols = ['site', 'wildtype', 'mutation', valuecol]
    assert set(cols) == set(tidy_df.columns), ('expected columns '
            '{0}\nactual columns {1}'.format(cols, tidy_df.columns))

    # make site a string
    tidy_df['site'] = tidy_df['site'].astype(str)

    # sort on site as here: https://stackoverflow.com/a/29582718
    tidy_df = tidy_df.reindex(index=natsort.order_by_index(tidy_df.index,
            natsort.index_realsorted(tidy_df.site)))

    # convert to wide form, keeping wildtype identities
    tidy_df = tidy_df.set_index('site', drop=True)
    wt = tidy_df['wildtype']
    wide_df = (tidy_df.pivot(columns='mutation', values=valuecol)
               .fillna(0.0)
               .join(wt)
               .reset_index()
               )
    wide_df = wide_df.drop_duplicates().reset_index(drop=True)

    return wide_df


def computeMutDiffSel(sel, mock, countcharacters, pseudocount,
                      translate_to_aa, err=None, mincount=0):
    """Compute mutation differential selection.

    Args:
        `sel` (pandas.DataFrame)
            Counts for selected sample. Columns should be `site`,
            `wildtype`, and every character in `countcharacters`.
        `mock` (pandas.DataFrame)
            Like `sel` but counts for mock-selected sample.
        `countcharacters` (list)
            List of all characters (e.g., codons).
        `pseudocount` (int or float > 0)
            Pseudocount to add to counts.
        `translate_to_aa` (bool)
            Should be `True` if counts are for codons and we are
            estimating diffsel for amino acids, `False` otherwise.
        `err` (pandas.DataFrame or `None`)
            Optional error-control counts, in same format as `sel`.
        `mincount` (int >= 0)
            Report as `NaN` the mutdiffsel for any mutation in which
            neither `sel` nor `mock` has at least this many counts.

    Returns:
        A `pandas.DataFrame` with the mutation differential selection.
        Columns are `site`, `wildtype`, `mutation`, `mutdiffsel`.
    """
    assert pseudocount > 0

    expectedcols = set(['site', 'wildtype'] + countcharacters)
    assert set(sel.columns) == expectedcols, "Invalid columns for sel"
    assert set(mock.columns) == expectedcols, "Invalid columns for mock"
    sel = sel.sort_values('site')
    mock = mock.sort_values('site')
    assert all(sel['site'] == mock['site']), "Inconsistent sites"
    assert all(sel['wildtype'] == mock['wildtype']), "Inconsistent wildtype"
    if err is not None:
        assert (set(err.columns) == expectedcols), "Invalid columns for err"
        err = err.sort_values('site')
        assert all(err['site'] == sel['site']), "Inconsistent sites"
        assert all(err['wildtype'] == sel['wildtype']), "Inconsistent wildtype"

    # make melted dataframe for calculations
    m = None
    for (df, name) in [(sel, 'sel'), (mock, 'mock'), (err, 'err')]:
        if (name == 'err') and (err is None):
            continue
        m_name = (df.melt(id_vars=['site', 'wildtype'],
                          var_name='mutation', value_name='n')
                  .sort_values(['site', 'mutation'])
                  .reset_index(drop=True)
                  )
        m_name['N'] = m_name.groupby('site').transform('sum')['n']
        if m is None:
            m = m_name[['site', 'wildtype', 'mutation']].copy()
        assert all([all(m[c] == m_name[c]) for c in
                    ['site', 'wildtype', 'mutation']])
        m['n{0}'.format(name)] = m_name['n'].astype('float')
        m['N{0}'.format(name)] = m_name['N'].astype('float')

    # error correction
    if err is not None:
        m['epsilon'] = m['nerr'] / m['Nerr']
        wtmask = (m['mutation'] == m['wildtype'])
        assert all(m[wtmask]['epsilon'] > 0), \
                "err counts of 0 for wildtype"
        for name in ['sel', 'mock']:
            ncol = 'n{0}'.format(name)
            Ncol = 'N{0}'.format(name)
            # set wildtype values without zero division
            wtval = pandas.Series(0, wtmask.index)
            wtval.loc[wtmask] = m.loc[wtmask, ncol] / m.loc[wtmask, 'epsilon']
            m[ncol] = numpy.maximum(0,
                    (m[Ncol] * (m[ncol] / m[Ncol] - m['epsilon']))
                    ).where(~wtmask, wtval)
            m[Ncol] = m.groupby('site').transform('sum')[ncol]
        m = m.reset_index()

    # convert codon to amino acid counts
    if translate_to_aa:
        assert set(countcharacters) == set(CODONS),\
                "translate_to_aa specified, but not using codons"
        m = (m.replace({'wildtype': CODON_TO_AA, 'mutation': CODON_TO_AA})
             .groupby(['site', 'wildtype', 'mutation', 'Nsel', 'Nmock'])
             .sum()
             .reset_index()
             )

    # add pseudocounts
    m['nselP'] = m['nsel'] + pseudocount * numpy.maximum(1,
            m['Nsel'] / m['Nmock'])
    m['nmockP'] = m['nmock'] + pseudocount * numpy.maximum(1,
            m['Nmock'] / m['Nsel'])

    # create columns with wildtype counts
    m = m.merge(m.query('mutation==wildtype')[['site', 'nselP', 'nmockP']],
                on='site', suffixes=('', 'wt'))

    # compute mutdiffsel
    m['enrichment'] = (
            (m['nselP'] / m['nselPwt']) / (m['nmockP'] / m['nmockPwt'])
            ).where((m['mutation'] != m['wildtype']) &
                    ((m['nsel'] >= mincount) | (m['nmock'] >= mincount)),
                    numpy.nan)
    m['mutdiffsel'] = numpy.log2(m['enrichment'])

    return m[['site', 'wildtype', 'mutation', 'mutdiffsel']]

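
# A minimal sketch, not part of the original module, illustrating the
# arithmetic `computeMutDiffSel` performs for a single mutation when there are
# no error-control counts: scale the pseudocount by the ratio of sequencing
# depths, compute the mutation's enrichment relative to wildtype in the
# selected versus mock sample, and take log2. The helper name and toy counts
# are hypothetical.
def _sketch_mutdiffsel_arithmetic():
    """Single-mutation version of the pseudocount / enrichment / log2 steps."""
    pseudocount = 5.0
    # toy counts for one mutation and the wildtype character at one site
    nsel_mut, nsel_wt, Nsel = 80.0, 900.0, 1000.0      # selected sample
    nmock_mut, nmock_wt, Nmock = 10.0, 1900.0, 2000.0  # mock-selected sample
    # pseudocount is scaled up for the more deeply sequenced sample
    psel = pseudocount * max(1, Nsel / Nmock)
    pmock = pseudocount * max(1, Nmock / Nsel)
    enrichment = (((nsel_mut + psel) / (nsel_wt + psel)) /
                  ((nmock_mut + pmock) / (nmock_wt + pmock)))
    return math.log2(enrichment)  # this is the mutdiffsel value
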

def mutToSiteDiffSel(mutdiffsel):
    """Computes sitediffsel from mutdiffsel.

    Args:
        `mutdiffsel` (pandas.DataFrame)
            Dataframe with mutdiffsel as returned by `computeMutDiffSel`

    Returns:
        The dataframe `sitediffsel`, which has the following columns:
            - `site`
            - `abs_diffsel`: sum of absolute values of mutdiffsel at site
            - `positive_diffsel`: sum of positive mutdiffsel at site
            - `negative_diffsel`: sum of negative mutdiffsel at site
            - `max_diffsel`: maximum mutdiffsel at site
            - `min_diffsel`: minimum mutdiffsel at site

    >>> mutdiffsel = (pandas.DataFrame({
    ...         'site': [1, 2, 3, 4],
    ...         'wildtype': ['A', 'C', 'C', 'A'],
    ...         'A': [numpy.nan, 4.1, -0.1, numpy.nan],
    ...         'C': [-0.2, numpy.nan, numpy.nan, 0.3],
    ...         'G': [3.2, 0.1, -0.2, 0.1],
    ...         'T': [-0.2, 0.0, -0.1, 0.4],
    ...         })
    ...         .melt(id_vars=['site', 'wildtype'],
    ...               var_name='mutation', value_name='mutdiffsel')
    ...         .reset_index(drop=True)
    ...         )
    >>> sitediffsel = mutToSiteDiffSel(mutdiffsel)
    >>> all(sitediffsel.columns == ['site', 'abs_diffsel',
    ...         'positive_diffsel', 'negative_diffsel',
    ...         'max_diffsel', 'min_diffsel'])
    True
    >>> all(sitediffsel['site'] == [1, 2, 3, 4])
    True
    >>> numpy.allclose(sitediffsel['abs_diffsel'], [3.6, 4.2, 0.4, 0.8])
    True
    >>> numpy.allclose(sitediffsel['positive_diffsel'], [3.2, 4.2, 0, 0.8])
    True
    >>> numpy.allclose(sitediffsel['negative_diffsel'], [-0.4, 0, -0.4, 0])
    True
    >>> numpy.allclose(sitediffsel['max_diffsel'], [3.2, 4.1, 0, 0.4])
    True
    >>> numpy.allclose(sitediffsel['min_diffsel'], [-0.2, 0, -0.2, 0])
    True
    """
    aas = list(set(mutdiffsel.wildtype).union(mutdiffsel.mutation))
    sitediffsel = (mutdiffsel
                   .pivot(index='site', columns='mutation',
                          values='mutdiffsel')
                   .fillna(0)
                   .assign(abs_diffsel=lambda x: x[aas].abs().sum(axis=1),
                           positive_diffsel=lambda x:
                               x[aas][x[aas] >= 0].sum(axis=1),
                           negative_diffsel=lambda x:
                               x[aas][x[aas] <= 0].sum(axis=1),
                           max_diffsel=lambda x: x[aas].max(axis=1),
                           min_diffsel=lambda x: x[aas].min(axis=1),
                           )
                   .reset_index()
                   [['site', 'abs_diffsel', 'positive_diffsel',
                     'negative_diffsel', 'max_diffsel', 'min_diffsel']]
                   )

    return sitediffsel


def avgMutDiffSel(mutdiffselfiles, avgtype):
    """Gets mean or median mutation differential selection.

    Args:
        `mutdiffselfiles` (list)
            List of CSV files with mutdiffsel as returned by
            ``dms2_diffsel``.
        `avgtype` (str)
            Type of "average" to calculate. Possibilities:
                - `mean`
                - `median`

    Returns:
        A `pandas.DataFrame` containing the mean or median mutation
        differential selection.

    >>> tf = tempfile.NamedTemporaryFile
    >>> with tf(mode='w') as f1, tf(mode='w') as f2, tf(mode='w') as f3:
    ...     x = f1.write('site,wildtype,mutation,mutdiffsel\\n'
    ...                  '157,K,D,8.3\\n'
    ...                  '156,G,S,2.0')
    ...     f1.flush()
    ...     x = f2.write('site,wildtype,mutation,mutdiffsel\\n'
    ...                  '157,K,D,6.3\\n'
    ...                  '156,G,S,-1.1')
    ...     f2.flush()
    ...     x = f3.write('site,wildtype,mutation,mutdiffsel\\n'
    ...                  '157,K,D,2.2\\n'
    ...                  '156,G,S,0.0')
    ...     f3.flush()
    ...     mean = avgMutDiffSel([f1.name, f2.name, f3.name], 'mean')
    ...     median = avgMutDiffSel([f1.name, f2.name, f3.name], 'median')
    >>> (mean['site'] == [157, 156]).all()
    True
    >>> (median['site'] == [157, 156]).all()
    True
    >>> (mean['wildtype'] == ['K', 'G']).all()
    True
    >>> (median['wildtype'] == ['K', 'G']).all()
    True
    >>> (mean['mutation'] == ['D', 'S']).all()
    True
    >>> (median['mutation'] == ['D', 'S']).all()
    True
    >>> numpy.allclose(mean['mutdiffsel'], [5.6, 0.3])
    True
    >>> numpy.allclose(median['mutdiffsel'], [6.3, 0.0])
    True
    """
    diffsels = []
    cols = ['site', 'wildtype', 'mutation', 'mutdiffsel']
    for f in mutdiffselfiles:
        diffsels.append(pandas.read_csv(f)
                        [cols]
                        .sort_values(['site', 'wildtype', 'mutation'])
                        .reset_index()
                        )
        assert all([(diffsels[0][c] == diffsels[-1][c]).all() for c in
                    ['site', 'wildtype', 'mutation']]), \
                "files do not have same muts"
    avg = pandas.concat(diffsels).groupby(cols[ : -1])
    if avgtype == 'mean':
        avg = avg.mean()
    elif avgtype == 'median':
        avg = avg.median()
    else:
        raise ValueError("invalid avgtype {0}".format(avgtype))
    return (avg.reset_index()
            .sort_values('mutdiffsel', ascending=False)
            [cols]
            .reset_index(drop=True)
            )


def df_read_filecols(df, filecols, *, order_sites=True):
    """Merges data frame with entries read from files.

    Designed to expand a data frame listing CSV files with site- or
    mutation-level selection information into a data frame listing the
    information in these CSV files.

    Args:
        `df` (pandas DataFrame)
            Each row gives files and associated information.
        `filecols` (list)
            List of columns in `df` that give filenames of CSV files to
            add to data frame. These CSV files cannot have column names
            already in `df`.
        `order_sites` (bool)
            Expect a `site` column, make it a naturally sorted
            categorical variable, and add an `isite` column that numbers
            sites 0, 1, ...

    Returns:
        A data frame where the entries in the files are now read as
        columns.

    >>> tf = tempfile.NamedTemporaryFile
    >>> with tf(mode='w') as sitediffsel1, tf(mode='w') as sitediffsel2, \\
    ...         tf(mode='w') as mutdiffsel1, tf(mode='w') as mutdiffsel2:
    ...
    ...     # first sitediffsel file
    ...     _ = sitediffsel1.write('site,sitediffsel\\n'
    ...                            '1,3.2\\n'
    ...                            '-1,2.3\\n'
    ...                            '(HA2)1,0.1')
    ...     sitediffsel1.flush()
    ...
    ...     # first mutdiffsel file
    ...     _ = mutdiffsel1.write('site,wildtype,mutation,mutdiffsel\\n'
    ...                           '-1,A,C,-0.7\\n'
    ...                           '-1,A,G,3.0\\n'
    ...                           '1,C,A,1.2\\n'
    ...                           '1,C,G,2.0\\n'
    ...                           '(HA2)1,C,A,0.0\\n'
    ...                           '(HA2)1,C,G,0.1')
    ...     mutdiffsel1.flush()
    ...
    ...     # second sitediffsel file
    ...     _ = sitediffsel2.write('site,sitediffsel\\n'
    ...                            '(HA2)1,9.1\\n'
    ...                            '1,1.2\\n'
    ...                            '-1,0.3\\n')
    ...     sitediffsel2.flush()
    ...
    ...     # second mutdiffsel file
    ...     _ = mutdiffsel2.write('site,wildtype,mutation,mutdiffsel\\n'
    ...                           '-1,A,C,-0.2\\n'
    ...                           '-1,A,G,0.5\\n'
    ...                           '1,C,A,1.1\\n'
    ...                           '1,C,G,0.1\\n'
    ...                           '(HA2)1,C,A,9.0\\n'
    ...                           '(HA2)1,C,G,0.1')
    ...     mutdiffsel2.flush()
    ...
    ...     # data frame with files as columns
    ...     df = pandas.DataFrame({
    ...         'name': ['sample_1', 'sample_2'],
    ...         'serum': ['serum_1', 'serum_1'],
    ...         'sitediffsel_file': [sitediffsel1.name, sitediffsel2.name],
    ...         'mutdiffsel_file': [mutdiffsel1.name, mutdiffsel2.name]
    ...         })
    ...
    ...     # call df_read_filecols
    ...     (df_read_filecols(df, ['sitediffsel_file', 'mutdiffsel_file'])
    ...      .drop(columns=['sitediffsel_file', 'mutdiffsel_file']))
            name    serum    site  sitediffsel wildtype mutation  mutdiffsel  isite
    0   sample_1  serum_1       1          3.2        C        A         1.2      1
    1   sample_1  serum_1       1          3.2        C        G         2.0      1
    2   sample_1  serum_1      -1          2.3        A        C        -0.7      0
    3   sample_1  serum_1      -1          2.3        A        G         3.0      0
    4   sample_1  serum_1  (HA2)1          0.1        C        A         0.0      2
    5   sample_1  serum_1  (HA2)1          0.1        C        G         0.1      2
    6   sample_2  serum_1  (HA2)1          9.1        C        A         9.0      2
    7   sample_2  serum_1  (HA2)1          9.1        C        G         0.1      2
    8   sample_2  serum_1       1          1.2        C        A         1.1      1
    9   sample_2  serum_1       1          1.2        C        G         0.1      1
    10  sample_2  serum_1      -1          0.3        A        C        -0.2      0
    11  sample_2  serum_1      -1          0.3        A        G         0.5      0
    """
    if not len(df):
        raise ValueError('`df` has no rows')
    df_cols = set(df.columns)
    if 'dummy' in df_cols:
        raise ValueError('`df` has column named "dummy"')
    if not (set(filecols) <= df_cols):
        raise ValueError('`df` does not have all the `filecol` columns')

    df_filecols = []
    for row in df.iterrows():
        # get data frame of just row, with a dummy column for merging
        row_df = row[1].to_frame().transpose().assign(dummy=1)
        for col in filecols:
            filename = row_df.at[row[0], col]
            file_df = pandas.read_csv(filename).assign(dummy=1)
            if order_sites and 'site' not in file_df.columns:
                raise ValueError(f"no `site` column in {filename}")
            sharedcols = set(file_df.columns).intersection(df_cols)
            if sharedcols:
                raise ValueError(f"`df` and {filename} share columns "
                                 f"{sharedcols}")
            row_df = row_df.merge(file_df)
        df_filecols.append(row_df)
    df_filecols = (pandas.concat(df_filecols, ignore_index=True)
                   .drop('dummy', axis='columns')
                   )

    if order_sites:
        sites = natsort.realsorted(df_filecols['site'].unique())
        df_filecols = (
            df_filecols
            .assign(site=lambda x: pandas.Categorical(x['site'], sites,
                                                      ordered=True),
                    isite=lambda x: x['site'].cat.codes)
            )

    return df_filecols


if __name__ == '__main__':
    import doctest
    doctest.testmod()