######## snakemake preamble start (automatically inserted, do not edit) ########
import sys; sys.path.extend(['/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/lib/python3.12/site-packages', '/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline', '/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/test_example/..', '/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/test_example', '/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/bin', '/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/lib/python3.12', '/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/lib/python3.12/lib-dynload', '/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/lib/python3.12/site-packages', '/home/jbloom/.cache/snakemake/snakemake/source-cache/runtime-cache/tmpxokz18n6/file/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/notebooks', '/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/notebooks']); import pickle; snakemake = pickle.loads(b'\x80\x04\x95Z\x16\x00\x00\x00\x00\x00\x00\x8c\x10snakemake.script\x94\x8c\tSnakemake\x94\x93\x94)\x81\x94}\x94(\x8c\x05input\x94\x8c\x0csnakemake.io\x94\x8c\nInputFiles\x94\x93\x94)\x81\x94(\x8c"results/plates/plate2/qc_drops.yml\x94\x8c#results/plates/plate11/qc_drops.yml\x94\x8c&results/plates/H3N2_plate/qc_drops.yml\x94\x8c0results/sera/pilot_A23038d0_r32_75K/qc_drops.yml\x94\x8c&results/sera/serum_M099d0/qc_drops.yml\x94\x8c\'results/sera/serum_M099d30/qc_drops.yml\x94\x8c\'results/sera/serum_Y044d30/qc_drops.yml\x94\x8c(results/sera/serum_Y154d182/qc_drops.yml\x94e}\x94(\x8c\x06_names\x94}\x94(\x8c\x0eplate_qc_drops\x94K\x00K\x03\x86\x94\x8c\x14groups_sera_qc_drops\x94K\x03K\x08\x86\x94u\x8c\x12_allowed_overrides\x94]\x94(\x8c\x05index\x94\x8c\x04sort\x94eh\x1b\x8c\tfunctools\x94\x8c\x07partial\x94\x93\x94h\x06\x8c\x19Namedlist._used_attribute\x94\x93\x94\x85\x94R\x94(h!)}\x94\x8c\x05_name\x94h\x1bsNt\x94bh\x1ch\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1csNt\x94bh\x15h\x06\x8c\tNamedlist\x94\x93\x94)\x81\
x94(h\nh\x0bh\x0ce}\x94(h\x13}\x94h\x19]\x94(h\x1bh\x1ceh\x1bh\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1bsNt\x94bh\x1ch\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1csNt\x94bubh\x17h,)\x81\x94(h\rh\x0eh\x0fh\x10h\x11e}\x94(h\x13}\x94h\x19]\x94(h\x1bh\x1ceh\x1bh\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1bsNt\x94bh\x1ch\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1csNt\x94bubub\x8c\x06output\x94h\x06\x8c\x0bOutputFiles\x94\x93\x94)\x81\x94(\x8c#results/qc_drops/plate_qc_drops.yml\x94\x8c)results/qc_drops/groups_sera_qc_drops.yml\x94e}\x94(h\x13}\x94(h\x15K\x00N\x86\x94h\x17K\x01N\x86\x94uh\x19]\x94(h\x1bh\x1ceh\x1bh\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1bsNt\x94bh\x1ch\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1csNt\x94bh\x15hIh\x17hJub\x8c\x06params\x94h\x06\x8c\x06Params\x94\x93\x94)\x81\x94(]\x94(\x8c\x06plate2\x94\x8c\x07plate11\x94\x8c\nH3N2_plate\x94e]\x94(\x8c\x05pilot\x94\x8c\x10A23038d0_r32_75K\x94\x86\x94\x8c\x05serum\x94\x8c\x06M099d0\x94\x86\x94hd\x8c\x07M099d30\x94\x86\x94hd\x8c\x07Y044d30\x94\x86\x94hd\x8c\x08Y154d182\x94\x86\x94ee}\x94(h\x13}\x94(\x8c\x06plates\x94K\x00N\x86\x94\x8c\x0bgroups_sera\x94K\x01N\x86\x94uh\x19]\x94(h\x1bh\x1ceh\x1bh\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1bsNt\x94bh\x1ch\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1csNt\x94bhoh\\hqh`ub\x8c\twildcards\x94h\x06\x8c\tWildcards\x94\x93\x94)\x81\x94}\x94(h\x13}\x94h\x19]\x94(h\x1bh\x1ceh\x1bh\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1bsNt\x94bh\x1ch\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1csNt\x94bub\x8c\x07threads\x94K\x01\x8c\tresources\x94h\x06\x8c\tResources\x94\x93\x94)\x81\x94(K\x01K\x01\x8c\x04/tmp\x94e}\x94(h\x13}\x94(\x8c\x06_cores\x94K\x00N\x86\x94\x8c\x06_nodes\x94K\x01N\x86\x94\x8c\x06tmpdir\x94K\x02N\x86\x94uh\x19]\x94(h\x1bh\x1ceh\x1bh\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1bsNt\x94bh\x1ch\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1csNt\x94bh\x93K\x01h\x95K\x01h\x97h\x90ub\x8c\x03log\x94h\x06\x8c\x03Log\x94\x93\x94)\x81\x94\x8c)results/qc_drops/aggregate_qc_drops.ipynb\x94a}\x94(h\x13}\x94\x8c\x08notebook\x94K\x00N\x86\x94sh\x19]\x94(h\x1bh\x1ceh\x1bh\x1fh!\x85\x94R\x94(h
!)}\x94h%h\x1bsNt\x94bh\x1ch\x1fh!\x85\x94R\x94(h!)}\x94h%h\x1csNt\x94bh\xa9h\xa6ub\x8c\x06config\x94}\x94(\x8c\x10seqneut-pipeline\x94\x8c\x03../\x94\x8c\x04docs\x94\x8c\x07../docs\x94\x8c\x0bdescription\x94X\xba\x01\x00\x00# Test example for [seqneut-pipeline](https://github.com/jbloomlab/seqneut-pipeline)\nThis is a small toy-example created by subsetting a real experiment dataset.\n\nSee [https://github.com/jbloomlab/seqneut-pipeline](https://github.com/jbloomlab/seqneut-pipeline)\nfor the computer code and underlying numerical data.\n\nSee [here](https://github.com/jbloomlab/seqneut-pipeline/graphs/contributors) for a\nlist of all contributors to the pipeline.\n\x94\x8c\x0fviral_libraries\x94}\x94(\x8c\x14pdmH1N1_lib2023_loes\x94\x8c-data/viral_libraries/pdmH1N1_lib2023_loes.csv\x94\x8c\x13H3N2_lib2023_Kikawa\x94\x8c)data/viral_libraries/2023_H3N2_Kikawa.csv\x94u\x8c\x17viral_strain_plot_order\x94\x8c data/viral_strain_plot_order.csv\x94\x8c\x12neut_standard_sets\x94}\x94\x8c\x08loes2023\x94\x8c3data/neut_standard_sets/loes2023_neut_standards.csv\x94s\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\x08upstream\x94\x8c\x1fCTCCCTACAATGTCGGATTTGTATTTAATAG\x94\x8c\ndownstream\x94\x8c\x00\x94\x8c\x04minq\x94K\x14\x8c\x11upstream_mismatch\x94K\x04\x8c\x0ebc_orientation\x94\x8c\x02R2\x94u\x8c#default_process_plate_qc_thresholds\x94}\x94(\x8c\x1bavg_barcode_counts_per_well\x94K\xfa\x8c\x1fmin_neut_standard_frac_per_well\x94G?tz\xe1G\xae\x14{\x8c"no_serum_per_viral_barcode_filters\x94}\x94(\x8c\x08min_frac\x94G?@bM\xd2\xf1\xa9\xfc\x8c\x0fmax_fold_change\x94K\x03\x8c\tmax_wells\x94K\x02u\x8c!per_neut_standard_barcode_filters\x94}\x94(\x8c\x08min_frac\x94G?tz\xe1G\xae\x14{\x8c\x0fmax_fold_change\x94K\x03\x8c\tmax_wells\x94K\x02u\x8c 
min_neut_standard_count_per_well\x94M\xf4\x01\x8c)min_no_serum_count_per_viral_barcode_well\x94K\x1e\x8c+max_frac_infectivity_per_viral_barcode_well\x94K\x05\x8c)min_dilutions_per_barcode_serum_replicate\x94K\x06u\x8c%default_process_plate_curvefit_params\x94}\x94(\x8c\x18frac_infectivity_ceiling\x94K\x01\x8c\x06fixtop\x94]\x94(G?\xe8\x00\x00\x00\x00\x00\x00K\x01e\x8c\tfixbottom\x94K\x00\x8c\x08fixslope\x94]\x94(G?\xe9\x99\x99\x99\x99\x99\x9aK\neu\x8c!default_process_plate_curvefit_qc\x94}\x94(\x8c\x1dmax_frac_infectivity_at_least\x94K\x00\x8c\x0fgoodness_of_fit\x94}\x94(\x8c\x06min_R2\x94G?\xe6ffffff\x8c\x08max_RMSD\x94G?\xb9\x99\x99\x99\x99\x99\x9au\x8c#serum_replicates_ignore_curvefit_qc\x94]\x94\x8c+barcode_serum_replicates_ignore_curvefit_qc\x94]\x94u\x8c\x06plates\x94}\x94(h]}\x94(\x8c\x05group\x94\x8c\x05serum\x94\x8c\x04date\x94\x8c\x08datetime\x94\x8c\x04date\x94\x93\x94C\x04\x07\xe7\x08\x01\x94\x85\x94R\x94\x8c\rviral_library\x94\x8c\x14pdmH1N1_lib2023_loes\x94\x8c\x11neut_standard_set\x94\x8c\x08loes2023\x94\x8c\x0bsamples_csv\x94\x8c\x1edata/plates/plate2_samples.csv\x94\x8c\x0cmanual_drops\x94}\x94\x8c\rqc_thresholds\x94}\x94(h\xd4K\xfah\xd5G?tz\xe1G\xae\x14{h\xd6}\x94(h\xd8G?@bM\xd2\xf1\xa9\xfch\xd9K\x03h\xdaK\x02uh\xdb}\x94(h\xddG?tz\xe1G\xae\x14{h\xdeK\x03h\xdfK\x02uh\xe0M\xf4\x01h\xe1K\x1eh\xe2K\x05h\xe3K\x06u\x8c\x0fcurvefit_params\x94}\x94(h\xe6K\x01h\xe7h\xe8h\xe9K\x00h\xeah\xebu\x8c\x0bcurvefit_qc\x94}\x94(h\xeeK\x00h\xef}\x94(h\xf1G?\xe6ffffffh\xf2G?\xb9\x99\x99\x99\x99\x99\x9auh\xf3h\xf4h\xf5h\xf6u\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\tupstream2\x94\x8c\x06GCTACA\x94\x8c\x12upstream2_mismatch\x94K\x01uuh^}\x94(\x8c\x05group\x94\x8c\x05serum\x94\x8c\x04date\x94h\xffC\x04\x07\xe7\t\x1a\x94\x85\x94R\x94\x8c\rviral_library\x94\x8c\x14pdmH1N1_lib2023_loes\x94\x8c\x11neut_standard_set\x94\x8c\x08loes2023\x94\x8c\x0bsamples_csv\x94\x8c\x1fdata/plates/plate11_samples.csv\x94\x8c\x0cmanual_drops\x94}\x94\x8c\x18barcode_serum_replicates\x
94]\x94]\x94(\x8c\x10AGTCCTATCCTCAAAT\x94\x8c\x06M099d0\x94eas\x8c\rqc_thresholds\x94}\x94(h\xd4K\xfah\xd5G?tz\xe1G\xae\x14{h\xd6}\x94(h\xd8G?@bM\xd2\xf1\xa9\xfch\xd9K\x03h\xdaK\x02uh\xdb}\x94(h\xddG?tz\xe1G\xae\x14{h\xdeK\x03h\xdfK\x02uh\xe0M\xf4\x01h\xe1K\x1eh\xe2K\x05h\xe3K\x06u\x8c\x0fcurvefit_params\x94}\x94(h\xe6K\x01h\xe7h\xe8h\xe9K\x00h\xeah\xebu\x8c\x0bcurvefit_qc\x94}\x94(h\xeeK\x00h\xef}\x94(h\xf1G?\xe6ffffffh\xf2G?\xb9\x99\x99\x99\x99\x99\x9auh\xf3h\xf4h\xf5]\x94]\x94(\x8c\x10AGGTCAAGACCACAGG\x94\x8c\x06M099d0\x94eau\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\tupstream2\x94\x8c\x06ATCGAT\x94\x8c\x12upstream2_mismatch\x94K\x01uuh_}\x94(\x8c\x05group\x94\x8c\x05pilot\x94\x8c\x04date\x94h\xffC\x04\x07\xe8\x03\x04\x94\x85\x94R\x94\x8c\rviral_library\x94\x8c\x13H3N2_lib2023_Kikawa\x94\x8c\x11neut_standard_set\x94\x8c\x08loes2023\x94\x8c\x0bsamples_csv\x94\x8c\x1cdata/plates/H3N2_samples.csv\x94\x8c\x0cmanual_drops\x94}\x94\x8c\rqc_thresholds\x94}\x94(h\xd4K\xfah\xd5G?tz\xe1G\xae\x14{h\xd6}\x94(h\xd8G?@bM\xd2\xf1\xa9\xfch\xd9K\x03h\xdaK\x02uh\xdb}\x94(h\xddG?tz\xe1G\xae\x14{h\xdeK\x03h\xdfK\x02uh\xe0M\xf4\x01h\xe1K\x1eh\xe2K\x05h\xe3K\x06u\x8c\x0fcurvefit_params\x94}\x94(h\xe6K\x01h\xe7h\xe8h\xe9K\x00h\xeah\xebu\x8c\x0bcurvefit_qc\x94}\x94(h\xeeK\x00h\xef}\x94(h\xf1G?\xe6ffffffh\xf2G?\xb9\x99\x99\x99\x99\x99\x9auh\xf3h\xf4h\xf5h\xf6uuu\x8c\x16default_serum_titer_as\x94\x8c\x08midpoint\x94\x8c\x1bdefault_serum_qc_thresholds\x94}\x94(\x8c\x0emin_replicates\x94K\x02\x8c\x1bmax_fold_change_from_median\x94K\x03\x8c\x11viruses_ignore_qc\x94]\x94u\x8c\x16sera_override_defaults\x94}\x94\x8c\x05serum\x94}\x94(\x8c\x07M099d30\x94}\x94\x8c\rqc_thresholds\x94}\x94(j[\x01\x00\x00K\x02j\\\x01\x00\x00K\x03j]\x01\x00\x00]\x94\x8c\x14A/Belgium/H0017/2022\x94aus\x8c\x07Y044d30\x94}\x94(\x8c\rqc_thresholds\x94}\x94(j[\x01\x00\x00K\x02j\\\x01\x00\x00K\x04j]\x01\x00\x00j^\x01\x00\x00u\x8c\x08titer_as\x94\x8c\x04nt50\x94uus\x8c\x14miscellaneous_plates\x94}\x94\x8c\x0erando
m_plate_1\x94}\x94(\x8c\x04date\x94h\xffC\x04\x07\xe7\x08\x01\x94\x85\x94R\x94\x8c\rviral_library\x94\x8c\x14pdmH1N1_lib2023_loes\x94\x8c\x11neut_standard_set\x94\x8c\x08loes2023\x94\x8c\x0bsamples_csv\x94\x8c,data/miscellaneous_plates/random_plate_1.csv\x94\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\tupstream2\x94\x8c\x06GCTACA\x94\x8c\x12upstream2_mismatch\x94K\x01uusu\x8c\x04rule\x94\x8c\x12aggregate_qc_drops\x94\x8c\x0fbench_iteration\x94N\x8c\tscriptdir\x94\x8cO/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/notebooks\x94ub.'); from snakemake.logging import logger; logger.printshellcmds = False; import os; os.chdir(r'/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/test_example');
######## snakemake preamble end #########
Aggregate and analyze the drops from QC-ing the plates and sera
import altair as alt
import pandas as pd
from ruamel.yaml import YAML
# Round-trip ("rt") ruamel.yaml instance; the cells below use it both to load
# the per-plate / per-serum QC files and to dump the merged dictionaries.
yaml = YAML(typ="rt")
# Switch off Altair's max-rows check on chart data. The result is bound to `_`,
# presumably just to keep the notebook cell from displaying it.
_ = alt.data_transformers.disable_max_rows()
Get variables from snakemake:
# Bind the values supplied on the `snakemake` object to plain names, grouped
# by kind: input files, output files, then rule parameters.
input_plate_qc_drops, input_groups_sera_qc_drops = (
    snakemake.input.plate_qc_drops,
    snakemake.input.groups_sera_qc_drops,
)
output_plate_qc_drops, output_groups_sera_qc_drops = (
    snakemake.output.plate_qc_drops,
    snakemake.output.groups_sera_qc_drops,
)
# `plates` is zipped against the plate inputs below, and `groups_sera` is
# unpacked as (group, serum) pairs zipped against the serum inputs.
plates, groups_sera = (
    snakemake.params.plates,
    snakemake.params.groups_sera,
)
Analyze plate QC drops
Read QC drops for individual plates into a merged dictionary, write it to YAML, and also convert to a DataFrame. If you really want to look into the details of what is being dropped, you will want to look at that merged YAML file.
# Load every plate's QC-drop YAML into one dictionary keyed by plate.
assert len(plates) == len(input_plate_qc_drops)
plate_qc_drops = {}
for plate_name, drops_path in zip(plates, input_plate_qc_drops):
    with open(drops_path) as fin:
        plate_qc_drops[plate_name] = yaml.load(fin)
# a shorter dictionary here would mean two entries of `plates` were equal
assert len(plate_qc_drops) == len(input_plate_qc_drops)

# Write the merged dictionary out as a single YAML file.
print(f"Writing merged plate drops to {output_plate_qc_drops}")
with open(output_plate_qc_drops, "w") as fout:
    yaml.dump(plate_qc_drops, stream=fout)

# Flatten plate -> drop type -> drop -> reason into one tuple per drop.
plate_qc_drop_tups = []
for plate_key, plate_val in plate_qc_drops.items():
    for droptype_key, droptype_val in plate_val.items():
        plate_qc_drop_tups.extend(
            (plate_key, droptype_key, drop_key, reason)
            for drop_key, reason in droptype_val.items()
        )

# Data frame with one row per drop.
plate_qc_drops_df = pd.DataFrame(
    data=plate_qc_drop_tups,
    columns=["plate", "drop type", "drop", "reason"],
)
Writing merged plate drops to results/qc_drops/plate_qc_drops.yml
# Count the distinct drops for each (plate, drop type, reason) combination.
drops_by_plate_type_reason = plate_qc_drops_df.groupby(
    by=["plate", "drop type", "reason"], as_index=False
)
plate_qc_drop_counts = drops_by_plate_type_reason.agg(n_drops=("drop", "nunique"))

# sanity check: the per-group distinct counts must add up to the row count
assert len(plate_qc_drops_df) == plate_qc_drop_counts["n_drops"].sum()
Now plot the number of drops for each plate. You should be worried about (and maybe re-do or discard) any plates with a very large number of drops:
# Point selection keyed on the plate, driven by mouse hover.
plate_selection = alt.selection_point(fields=["plate"], on="mouseover", empty=False)

# Encoding channels, defined up-front so the chart assembly below stays short.
plate_x = alt.X("n_drops", title="number of drops")
plate_y = alt.Y(
    "plate",
    sort=plates,  # keep plates in the order given by the snakemake params
    title=None,
    axis=alt.Axis(labelFontStyle="bold", labelFontSize=11),
)
plate_column = alt.Column(
    "drop type",
    title=None,
    spacing=5,
    header=alt.Header(labelFontSize=12, labelFontStyle="bold", labelPadding=1),
)
plate_color = alt.Color(
    "reason",
    legend=alt.Legend(
        orient="top", columns=1, labelLimit=230, title=None, padding=1
    ),
)
# bars of the selected plate get a thicker outline (3 vs 0.5)
plate_stroke_width = alt.condition(plate_selection, alt.value(3), alt.value(0.5))
plate_chart_title = alt.TitleParams(
    "Number of QC drops when processing plates", anchor="middle", dy=-2
)

plate_qc_drop_counts_chart = (
    alt.Chart(plate_qc_drop_counts)
    .add_params(plate_selection)
    .encode(
        x=plate_x,
        y=plate_y,
        column=plate_column,
        color=plate_color,
        strokeWidth=plate_stroke_width,
        tooltip=list(plate_qc_drop_counts.columns),
    )
    .mark_bar(height={"band": 0.8}, stroke="black")
    .properties(width=230, height=alt.Step(16), title=plate_chart_title)
    .configure_axis(grid=False)
    .resolve_scale(color="independent", x="independent")
)

plate_qc_drop_counts_chart
Look for barcodes dropped especially often in plate QC
If a barcode is dropped especially often across plates, that could indicate something problematic with that barcode such that it should be removed altogether from the library analysis.
# Keep only the rows whose drop type starts with "barcode".
barcode_rows = plate_qc_drops_df.query("`drop type`.str.startswith('barcode')")

# The barcode is the first whitespace-separated token of the drop label.
barcode_rows_labeled = barcode_rows.assign(
    barcode=barcode_rows["drop"].str.split().str[0]
)

# Per drop type and barcode: distinct plates affected and total number of rows.
barcode_drops = barcode_rows_labeled.groupby(
    ["drop type", "barcode"], as_index=False
).agg(
    plates_where_dropped=("plate", "nunique"),
    total_drops=("plate", "count"),
)
# Point selection keyed on the barcode, driven by mouse hover.
barcode_selection = alt.selection_point(fields=["barcode"], on="mouseover", empty=False)

# All encoding channels for the chart, gathered in one mapping.
barcode_encodings = {
    "x": alt.X("total_drops", title="times barcode dropped"),
    "y": alt.Y(
        "barcode",
        # most frequently dropped barcodes first
        sort=alt.SortField("total_drops", order="descending"),
        axis=alt.Axis(labelFontSize=9),
    ),
    "column": alt.Column(
        "drop type",
        title=None,
        spacing=8,
        header=alt.Header(labelFontSize=12, labelFontStyle="bold", labelPadding=1),
    ),
    # bars of the selected barcode get a thicker outline (3 vs 0.5)
    "strokeWidth": alt.condition(barcode_selection, alt.value(3), alt.value(0.5)),
    "tooltip": list(barcode_drops.columns),
}

# NOTE(review): this title is identical to the per-plate chart's title —
# confirm whether a barcode-specific title was intended.
barcode_chart_title = alt.TitleParams(
    "Number of QC drops when processing plates", anchor="middle", dy=-2
)

barcode_drops_chart = (
    alt.Chart(barcode_drops)
    .add_params(barcode_selection)
    .encode(**barcode_encodings)
    .mark_bar(height={"band": 0.8}, stroke="black")
    .properties(width=200, height=alt.Step(10), title=barcode_chart_title)
    .configure_axis(grid=False)
    .resolve_scale(color="independent", x="independent", y="independent")
)

barcode_drops_chart
Analyze the groups/sera QC
Analyze the QC performed on the groups/sera, which involves completely dropping titers for certain virus-sera pairs.
Read the QC for different groups/sera into a merged dictionary, write it to YAML, and also convert to a DataFrame. If you really want to look into the details of what is being dropped, you will want to look at that merged YAML file.
# Load every serum's QC-drop YAML into a nested dictionary: group -> serum -> drops.
assert len(groups_sera) == len(input_groups_sera_qc_drops)
groups_sera_qc_drops = {}
for (group_name, serum_name), drops_path in zip(groups_sera, input_groups_sera_qc_drops):
    # create the per-group dictionary the first time the group is seen
    drops_for_group = groups_sera_qc_drops.setdefault(group_name, {})
    with open(drops_path) as fin:
        drops_for_group[serum_name] = yaml.load(fin)

# Write the merged dictionary out as a single YAML file.
print(f"Writing merged groups/sera drops to {output_groups_sera_qc_drops}")
with open(output_groups_sera_qc_drops, "w") as fout:
    yaml.dump(groups_sera_qc_drops, stream=fout)

# Flatten group -> serum -> virus -> reason into one tuple per dropped virus.
groups_sera_qc_drop_tups = []
for group_key, group_val in groups_sera_qc_drops.items():
    for serum_key, serum_val in group_val.items():
        groups_sera_qc_drop_tups.extend(
            (group_key, serum_key, virus, reason)
            for virus, reason in serum_val.items()
        )

# Data frame with one row per dropped virus.
groups_sera_qc_drops_df = pd.DataFrame(
    data=groups_sera_qc_drop_tups,
    columns=["group", "serum", "virus", "reason"],
)
Writing merged groups/sera drops to results/qc_drops/groups_sera_qc_drops.yml
Plot the number of viruses dropped for each group/serum. If a group/serum has many missed viruses, then you will lack a lot of titers and so it may be worth reviewing the cause of the drops.
# Count the distinct viruses dropped for each (group, serum, reason) combination.
drops_by_group_serum_reason = groups_sera_qc_drops_df.groupby(
    by=["group", "serum", "reason"], as_index=False
)
groups_sera_n_drops = drops_by_group_serum_reason.agg(n_viruses=("virus", "nunique"))

# sanity check: the per-group distinct counts must add up to the row count
assert len(groups_sera_qc_drops_df) == groups_sera_n_drops["n_viruses"].sum()
# Bar chart: viruses dropped per serum, one facet row per group, colored by reason.
serum_reason_color = alt.Color(
    "reason", title="reason dropped", legend=alt.Legend(labelLimit=350)
)

groups_sera_n_drops_chart = (
    alt.Chart(groups_sera_n_drops)
    .mark_bar(height={"band": 0.8})
    .encode(
        x=alt.X("n_viruses", title="number of viruses dropped"),
        y=alt.Y("serum"),
        row=alt.Row("group"),
        color=serum_reason_color,
        tooltip=list(groups_sera_n_drops.columns),
    )
    .properties(
        width=250,
        height=alt.Step(13),
        title="Number of viruses dropped at serum QC for each serum",
    )
    .configure_axis(grid=False)
    # every group row gets its own x and y scales
    .resolve_scale(y="independent", x="independent")
)

groups_sera_n_drops_chart
Plot the number of sera for which each virus is dropped during serum QC. If a virus is dropped for many sera, that may indicate some issue with that virus in assays:
# Count the distinct sera for which each virus is dropped, per group and reason.
drops_by_group_virus_reason = groups_sera_qc_drops_df.groupby(
    by=["group", "virus", "reason"], as_index=False
)
virus_n_drops = drops_by_group_virus_reason.agg(n_sera=("serum", "nunique"))

# sanity check: the per-group distinct counts must add up to the row count
assert len(groups_sera_qc_drops_df) == virus_n_drops["n_sera"].sum()
# Bar chart: sera per dropped virus, one facet row per group, colored by reason.
virus_reason_color = alt.Color(
    "reason", title="reason dropped", legend=alt.Legend(labelLimit=350)
)
# viruses dropped for the most sera are listed first
virus_axis = alt.Y("virus", sort=alt.SortField("n_sera", order="descending"))

virus_n_drops_chart = (
    alt.Chart(virus_n_drops)
    .mark_bar(height={"band": 0.8})
    .encode(
        x=alt.X("n_sera", title="number of sera for which virus is dropped"),
        y=virus_axis,
        row=alt.Row("group"),
        color=virus_reason_color,
        tooltip=list(virus_n_drops.columns),
    )
    .properties(
        width=250,
        height=alt.Step(13),
        title="Number of sera for which each virus is dropped at serum QC",
    )
    .configure_axis(grid=False)
    # every group row gets its own x and y scales
    .resolve_scale(y="independent", x="independent")
)

virus_n_drops_chart