Source code for neutcurve.parse_excel

"""
==================
parse_excel
==================

Defines functions for parsing data directly from Excel in specific formats.
"""

import os

import pandas as pd


[docs] def parseRachelStyle2019( *, excelfile, sheet_mapping, ): """Parse data from Rachel Eguia's 2019-style assays on Bloom lab Tecan. Args: `excelfile` (str) Excel file with data exactly as produced by plate reader. `sheet_mapping` (dict) Describes data in each sheet of `excelfile`. There should be a key matching the name of each Excel sheet. The value for each key is another dict with the following keys / values describing the data for that plate (we assume one sheet per serum / virus pair): - 'serum': name of serum. - 'virus': name of virus. - 'dilution_factor': fold-dilution of serum along plate. - 'initial_concentration': serum concentration in first well used to start dilution series. - 'excluded_dilutions' (optional) : list of dilutions to exclude from returned results, with 1 being least dilute and 12 being most dilute. Useful if some columns in plate have a known problem. Returns: A pandas DataFrame in the tidy format read by :class:`neutcurve.curvefits.CurveFits`. """ if not os.path.isfile(excelfile): raise ValueError(f"cannot find `excelfile` {excelfile}") # choose engine: https://stackoverflow.com/a/65266270/4191652 if os.path.splitext(excelfile)[-1] == ".xls": engine = "xlrd" elif os.path.splitext(excelfile)[-1] == ".xlsx": engine = "openpyxl" else: raise ValueError(f"invalid extension in `excelfile` {excelfile}") sheet_data = pd.read_excel( excelfile, sheet_name=None, # read all sheets engine=engine, skiprows=range(0, 30), index_col=0, nrows=8, ) # keys of sheet_mapping should be str sheet_mapping = {str(key): val for key, val in sheet_mapping.items()} extra_sheets = set(sheet_data) - set(sheet_mapping) if extra_sheets: raise ValueError( f"`excelfile` {excelfile} has the following extra " f"sheets not in `sheet_mapping`: {extra_sheets}" ) neutdata = [] req_keys = ["dilution_factor", "initial_concentration", "serum", "virus"] optional_keys = ["excluded_dilutions"] for sheet, sheet_info in sheet_mapping.items(): if sheet not in sheet_data: raise ValueError( f"sheet {sheet} specified in `sheet_mapping` " f"is not present in `excelfile` {excelfile}" ) extra_keys = set(sheet_info.keys()) - set(req_keys + optional_keys) if extra_keys: raise ValueError( f"`sheet_mapping` has extra keys {extra_keys} " f"for sheet {sheet}. Allowed keys are {req_keys} " f"and optionally {optional_keys}." ) kwargs = {"layout": "RachelStyle2019"} for key in req_keys: try: kwargs[key] = sheet_info[key] except KeyError: raise ValueError( f"`sheet_mapping` does not specify {key} " f"for sheet {sheet}" ) for key in optional_keys: if key in sheet_info: kwargs[key] = sheet_info[key] neutdata.append( _parse_BloomLab_TecanPlate( sheet_df=sheet_data[sheet], sheet_name_for_err=f"sheet {sheet} of {excelfile}", **kwargs, ) ) return pd.concat(neutdata, ignore_index=True, sort=False)
def _parse_BloomLab_TecanPlate( *, sheet_df, sheet_name_for_err, initial_concentration, dilution_factor, layout, serum, virus, excluded_dilutions=(), ): """Process data frame for one 96-well plate of Bloom lab Tecan.""" # expected rows / columns rows = ["A", "B", "C", "D", "E", "F", "G", "H"] ncols = 12 all_cols = list(range(1, ncols + 1)) if layout == "RachelStyle2019": # reverse excluded dilution columns if assay dilutes columns in reverse exclude_cols = {ncols - col + 1 for col in excluded_dilutions} else: raise ValueError(f"invalid layout of {layout}") if exclude_cols > set(all_cols): raise ValueError("invalid col to exclude") retained_cols = [col for col in all_cols if col not in exclude_cols] if not retained_cols: raise ValueError("no cols retained after `exclude_cols`") # check for expected rows / columns if sheet_df.index.tolist() != rows: raise ValueError( f"{sheet_name_for_err} does not have expected rows.\n" f"Expected: {rows}\nGot: {sheet_df.index.tolist()}" ) if sheet_df.columns.tolist() != all_cols: raise ValueError( f"{sheet_name_for_err} does not have expected " f"columns.\nExpected: {all_cols}\n" f"Got: {sheet_df.columns.tolist()}" ) if not all(pd.api.types.is_numeric_dtype(sheet_df[c]) for c in all_cols): raise ValueError(f"Entries in {sheet_name_for_err} not all numerical") sheet_df = sheet_df[retained_cols] if layout == "RachelStyle2019": reps = { rep: sheet_df.loc[row] for rep, row in [("1", "D"), ("2", "E"), ("3", "F")] } no_serum = (sheet_df.loc["C"] + sheet_df.loc["G"]) / 2 virus_only = sheet_df.loc["B"] media_only = (sheet_df.loc["A"] + sheet_df.loc["H"]) / 2 if any(media_only > virus_only): raise ValueError( f"In {sheet_name_for_err}, the media-only " f"control has more signal than the virus-only " f"control, which is unexpected:\n{sheet_df}" ) # compute fraction infectivity for each replicate data = { rep: (repval - virus_only) / (no_serum - virus_only) for rep, repval in reps.items() } data["concentration"] = [ initial_concentration / dilution_factor ** (ncols - col) for col in retained_cols ] df = pd.DataFrame(data).melt( id_vars="concentration", var_name="replicate", value_name="fraction infectivity", ) else: raise ValueError(f"invalid `layout` of {layout}") return df.assign(serum=serum, virus=virus)[ ["serum", "virus", "replicate", "concentration", "fraction infectivity"] ] if __name__ == "__main__": import doctest doctest.testmod()