In [1]:
######## snakemake preamble start (automatically inserted, do not edit) ########
import sys; sys.path.extend(['/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/lib/python3.12/site-packages', '/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline', '/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/test_example/..', '/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/test_example', '/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/bin', '/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/lib/python3.12', '/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/lib/python3.12/lib-dynload', '/fh/fast/bloom_j/software/miniconda3/envs/seqneut-pipeline/lib/python3.12/site-packages', '/home/jbloom/.cache/snakemake/snakemake/source-cache/runtime-cache/tmpxokz18n6/file/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/notebooks', '/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/notebooks']); import pickle; snakemake = pickle.loads(b'\x80\x04\x95V(\x00\x00\x00\x00\x00\x00\x8c\x10snakemake.script\x94\x8c\tSnakemake\x94\x93\x94)\x81\x94}\x94(\x8c\x05input\x94\x8c\x0csnakemake.io\x94\x8c\nInputFiles\x94\x93\x94)\x81\x94(\x8c:results/barcode_counts/H3N2_plate_A23038d0_r32_75K_100.csv\x94\x8c:results/barcode_counts/H3N2_plate_A23038d0_r32_75K_200.csv\x94\x8c:results/barcode_counts/H3N2_plate_A23038d0_r32_75K_400.csv\x94\x8c:results/barcode_counts/H3N2_plate_A23038d0_r32_75K_800.csv\x94\x8c;results/barcode_counts/H3N2_plate_A23038d0_r32_75K_1600.csv\x94\x8c;results/barcode_counts/H3N2_plate_A23038d0_r32_75K_3200.csv\x94\x8c;results/barcode_counts/H3N2_plate_A23038d0_r32_75K_6400.csv\x94\x8c<results/barcode_counts/H3N2_plate_A23038d0_r32_75K_12800.csv\x94\x8c,results/barcode_counts/H3N2_plate_none-1.csv\x94\x8c,results/barcode_counts/H3N2_plate_none-2.csv\x94\x8c,results/barcode_counts/H3N2_plate_none-3.csv\x94\x8c,results/barcode_counts/H3N2_plate_none-4.csv\x94\x8c,results/barcode_counts/H3N2_plate_none-5.csv\x94\x8c,results/barcode_counts/H3N2_plate_none-6.csv\x94\x8c,results/barcode_counts/H3N2_plate_none-7.csv\x94\x8c,results/barcode_counts/H3N2_plate_none-8.csv\x94\x8c9results/barcode_fates/H3N2_plate_A23038d0_r32_75K_100.csv\x94\x8c9results/barcode_fates/H3N2_plate_A23038d0_r32_75K_200.csv\x94\x8c9results/barcode_fates/H3N2_plate_A23038d0_r32_75K_400.csv\x94\x8c9results/barcode_fates/H3N2_plate_A23038d0_r32_75K_800.csv\x94\x8c:results/barcode_fates/H3N2_plate_A23038d0_r32_75K_1600.csv\x94\x8c:results/barcode_fates/H3N2_plate_A23038d0_r32_75K_3200.csv\x94\x8c:results/barcode_fates/H3N2_plate_A23038d0_r32_75K_6400.csv\x94\x8c;results/barcode_fates/H3N2_plate_A23038d0_r32_75K_12800.csv\x94\x8c+results/barcode_fates/H3N2_plate_none-1.csv\x94\x8c+results/barcode_fates/H3N2_plate_none-2.csv\x94\x8c+results/barcode_fates/H3N2_plate_none-3.csv\x94\x8c+results/barcode_fates/H3N2_plate_none-4.csv\x94\x8c+results/barcode_fates/H3N2_plate_none-5.csv\x94\x8c+results/barcode_fates/H3N2_plate_none-6.csv\x94\x8c+results/barcode_fates/H3N2_plate_none-7.csv\x94\x8c+results/barcode_fates/H3N2_plate_none-8.csv\x94\x8c)data/viral_libraries/2023_H3N2_Kikawa.csv\x94\x8c3data/neut_standard_sets/loes2023_neut_standards.csv\x94e}\x94(\x8c\x06_names\x94}\x94(\x8c\ncount_csvs\x94K\x00K\x10\x86\x94\x8c\tfate_csvs\x94K\x10K \x86\x94\x8c\x11viral_library_csv\x94K N\x86\x94\x8c\x15neut_standard_set_csv\x94K!N\x86\x94u\x8c\x12_allowed_overrides\x94]\x94(\x8c\x05index\x94\x8c\x04sort\x94eh9\x8c\tfunctools\x94\x8c\x07partial\x94\x93\x94h\x06\x8c\x19Namedlist._used_attribute\x94\x93\x94\x85\x94R\x94(h?)}\x94\x8c\x05_name\x94h9sNt\x94bh:h=h?\x85\x94R\x94(h?)}\x94hCh:sNt\x94bh/h\x06\x8c\tNamedlist\x94\x93\x94)\x81\x94(h\nh\x0bh\x0ch\rh\x0eh\x0fh\x10h\x11h\x12h\x13h\x14h\x15h\x16h\x17h\x18h\x19e}\x94(h-}\x94h7]\x94(h9h:eh9h=h?\x85\x94R\x94(h?)}\x94hCh9sNt\x94bh:h=h?\x85\x94R\x94(h?)}\x94hCh:sNt\x94bubh1hJ)\x81\x94(h\x1ah\x1bh\x1ch\x1dh\x1eh\x1fh h!h"h#h$h%h&h\'h(h)e}\x94(h-}\x94h7]\x94(h9h:eh9h=h?\x85\x94R\x94(h?)}\x94hCh9sNt\x94bh:h=h?\x85\x94R\x94(h?)}\x94hCh:sNt\x94bubh3h*h5h+ub\x8c\x06output\x94h\x06\x8c\x0bOutputFiles\x94\x93\x94)\x81\x94(\x8c&results/plates/H3N2_plate/qc_drops.yml\x94\x8c.results/plates/H3N2_plate/frac_infectivity.csv\x94\x8c\'results/plates/H3N2_plate/curvefits.csv\x94\x8c*results/plates/H3N2_plate/curvefits.pickle\x94e}\x94(h-}\x94(\x8c\x08qc_drops\x94K\x00N\x86\x94\x8c\x14frac_infectivity_csv\x94K\x01N\x86\x94\x8c\x08fits_csv\x94K\x02N\x86\x94\x8c\x0bfits_pickle\x94K\x03N\x86\x94uh7]\x94(h9h:eh9h=h?\x85\x94R\x94(h?)}\x94hCh9sNt\x94bh:h=h?\x85\x94R\x94(h?)}\x94hCh:sNt\x94bhmhghohhhqhihshjub\x8c\x06params\x94h\x06\x8c\x06Params\x94\x93\x94)\x81\x94(]\x94(\x8c\x1fH3N2_plate_A23038d0_r32_75K_100\x94\x8c\x1fH3N2_plate_A23038d0_r32_75K_200\x94\x8c\x1fH3N2_plate_A23038d0_r32_75K_400\x94\x8c\x1fH3N2_plate_A23038d0_r32_75K_800\x94\x8c H3N2_plate_A23038d0_r32_75K_1600\x94\x8c H3N2_plate_A23038d0_r32_75K_3200\x94\x8c H3N2_plate_A23038d0_r32_75K_6400\x94\x8c!H3N2_plate_A23038d0_r32_75K_12800\x94\x8c\x11H3N2_plate_none-1\x94\x8c\x11H3N2_plate_none-2\x94\x8c\x11H3N2_plate_none-3\x94\x8c\x11H3N2_plate_none-4\x94\x8c\x11H3N2_plate_none-5\x94\x8c\x11H3N2_plate_none-6\x94\x8c\x11H3N2_plate_none-7\x94\x8c\x11H3N2_plate_none-8\x94e}\x94(\x8c\x05group\x94\x8c\x05pilot\x94\x8c\x04date\x94\x8c\n2024-03-04\x94\x8c\rviral_library\x94\x8c\x13H3N2_lib2023_Kikawa\x94\x8c\x11neut_standard_set\x94\x8c\x08loes2023\x94\x8c\x0bsamples_csv\x94\x8c\x1cdata/plates/H3N2_samples.csv\x94\x8c\x0cmanual_drops\x94}\x94\x8c\rqc_thresholds\x94}\x94(\x8c\x1bavg_barcode_counts_per_well\x94K\xfa\x8c\x1fmin_neut_standard_frac_per_well\x94G?tz\xe1G\xae\x14{\x8c"no_serum_per_viral_barcode_filters\x94}\x94(\x8c\x08min_frac\x94G?@bM\xd2\xf1\xa9\xfc\x8c\x0fmax_fold_change\x94K\x03\x8c\tmax_wells\x94K\x02u\x8c!per_neut_standard_barcode_filters\x94}\x94(\x8c\x08min_frac\x94G?tz\xe1G\xae\x14{\x8c\x0fmax_fold_change\x94K\x03\x8c\tmax_wells\x94K\x02u\x8c min_neut_standard_count_per_well\x94M\xf4\x01\x8c)min_no_serum_count_per_viral_barcode_well\x94K\x1e\x8c+max_frac_infectivity_per_viral_barcode_well\x94K\x05\x8c)min_dilutions_per_barcode_serum_replicate\x94K\x06u\x8c\x0fcurvefit_params\x94}\x94(\x8c\x18frac_infectivity_ceiling\x94K\x01\x8c\x06fixtop\x94]\x94(G?\xe8\x00\x00\x00\x00\x00\x00K\x01e\x8c\tfixbottom\x94K\x00\x8c\x08fixslope\x94]\x94(G?\xe9\x99\x99\x99\x99\x99\x9aK\neu\x8c\x0bcurvefit_qc\x94}\x94(\x8c\x1dmax_frac_infectivity_at_least\x94K\x00\x8c\x0fgoodness_of_fit\x94}\x94(\x8c\x06min_R2\x94G?\xe6ffffff\x8c\x08max_RMSD\x94G?\xb9\x99\x99\x99\x99\x99\x9au\x8c#serum_replicates_ignore_curvefit_qc\x94]\x94\x8c+barcode_serum_replicates_ignore_curvefit_qc\x94]\x94u\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\x08upstream\x94\x8c\x1fCTCCCTACAATGTCGGATTTGTATTTAATAG\x94\x8c\ndownstream\x94\x8c\x00\x94\x8c\x04minq\x94K\x14\x8c\x11upstream_mismatch\x94K\x04\x8c\x0ebc_orientation\x94\x8c\x02R2\x94u\x8c\x07samples\x94}\x94(\x8c\x04well\x94}\x94(K\x00\x8c\x02A7\x94K\x01\x8c\x02B7\x94K\x02\x8c\x02C7\x94K\x03\x8c\x02D7\x94K\x04\x8c\x02E7\x94K\x05\x8c\x02F7\x94K\x06\x8c\x02G7\x94K\x07\x8c\x02H7\x94K\x08\x8c\x03A10\x94K\t\x8c\x03B10\x94K\n\x8c\x03C10\x94K\x0b\x8c\x03D10\x94K\x0c\x8c\x03E10\x94K\r\x8c\x03F10\x94K\x0e\x8c\x03G10\x94K\x0f\x8c\x03H10\x94u\x8c\x05serum\x94}\x94(K\x00\x8c\x10A23038d0_r32_75K\x94K\x01h\xe5K\x02h\xe5K\x03h\xe5K\x04h\xe5K\x05h\xe5K\x06h\xe5K\x07h\xe5K\x08\x8c\x04none\x94K\th\xe6K\nh\xe6K\x0bh\xe6K\x0ch\xe6K\rh\xe6K\x0eh\xe6K\x0fh\xe6u\x8c\x0fdilution_factor\x94}\x94(K\x00KdK\x01K\xc8K\x02M\x90\x01K\x03M \x03K\x04M@\x06K\x05M\x80\x0cK\x06M\x00\x19K\x07M\x002K\x08NK\tNK\nNK\x0bNK\x0cNK\rNK\x0eNK\x0fNu\x8c\treplicate\x94}\x94(K\x00K\x01K\x01K\x01K\x02K\x01K\x03K\x01K\x04K\x01K\x05K\x01K\x06K\x01K\x07K\x01K\x08K\x01K\tK\x02K\nK\x03K\x0bK\x04K\x0cK\x05K\rK\x06K\x0eK\x07K\x0fK\x08u\x8c\x05fastq\x94}\x94(K\x00\x8c1fastqs/A23038d0_r32_75K_conc1_S49_R1_001.fastq.gz\x94K\x01\x8c1fastqs/A23038d0_r32_75K_conc2_S50_R1_001.fastq.gz\x94K\x02\x8c1fastqs/A23038d0_r32_75K_conc3_S51_R1_001.fastq.gz\x94K\x03\x8c1fastqs/A23038d0_r32_75K_conc4_S52_R1_001.fastq.gz\x94K\x04\x8c1fastqs/A23038d0_r32_75K_conc5_S53_R1_001.fastq.gz\x94K\x05\x8c1fastqs/A23038d0_r32_75K_conc6_S54_R1_001.fastq.gz\x94K\x06\x8c1fastqs/A23038d0_r32_75K_conc7_S55_R1_001.fastq.gz\x94K\x07\x8c1fastqs/A23038d0_r32_75K_conc8_S56_R1_001.fastq.gz\x94K\x08\x8c4fastqs/A23038d0_r32_75K_noSerum1_S73_R1_001.fastq.gz\x94K\t\x8c4fastqs/A23038d0_r32_75K_noSerum2_S74_R1_001.fastq.gz\x94K\n\x8c4fastqs/A23038d0_r32_75K_noSerum3_S75_R1_001.fastq.gz\x94K\x0b\x8c4fastqs/A23038d0_r32_75K_noSerum4_S76_R1_001.fastq.gz\x94K\x0c\x8c4fastqs/A23038d0_r32_75K_noSerum5_S77_R1_001.fastq.gz\x94K\r\x8c4fastqs/A23038d0_r32_75K_noSerum6_S78_R1_001.fastq.gz\x94K\x0e\x8c4fastqs/A23038d0_r32_75K_noSerum7_S79_R1_001.fastq.gz\x94K\x0f\x8c4fastqs/A23038d0_r32_75K_noSerum8_S80_R1_001.fastq.gz\x94u\x8c\x0fserum_replicate\x94}\x94(K\x00h\xe5K\x01h\xe5K\x02h\xe5K\x03h\xe5K\x04h\xe5K\x05h\xe5K\x06h\xe5K\x07h\xe5K\x08\x8c\x06none-1\x94K\t\x8c\x06none-2\x94K\n\x8c\x06none-3\x94K\x0b\x8c\x06none-4\x94K\x0c\x8c\x06none-5\x94K\r\x8c\x06none-6\x94K\x0e\x8c\x06none-7\x94K\x0f\x8c\x06none-8\x94u\x8c\x0esample_noplate\x94}\x94(K\x00\x8c\x14A23038d0_r32_75K_100\x94K\x01\x8c\x14A23038d0_r32_75K_200\x94K\x02\x8c\x14A23038d0_r32_75K_400\x94K\x03\x8c\x14A23038d0_r32_75K_800\x94K\x04\x8c\x15A23038d0_r32_75K_1600\x94K\x05\x8c\x15A23038d0_r32_75K_3200\x94K\x06\x8c\x15A23038d0_r32_75K_6400\x94K\x07\x8c\x16A23038d0_r32_75K_12800\x94K\x08h\xffK\tj\x00\x01\x00\x00K\nj\x01\x01\x00\x00K\x0bj\x02\x01\x00\x00K\x0cj\x03\x01\x00\x00K\rj\x04\x01\x00\x00K\x0ej\x05\x01\x00\x00K\x0fj\x06\x01\x00\x00u\x8c\x06sample\x94}\x94(K\x00h\x83K\x01h\x84K\x02h\x85K\x03h\x86K\x04h\x87K\x05h\x88K\x06h\x89K\x07h\x8aK\x08h\x8bK\th\x8cK\nh\x8dK\x0bh\x8eK\x0ch\x8fK\rh\x90K\x0eh\x91K\x0fh\x92u\x8c\x05plate\x94}\x94(K\x00\x8c\nH3N2_plate\x94K\x01j\x15\x01\x00\x00K\x02j\x15\x01\x00\x00K\x03j\x15\x01\x00\x00K\x04j\x15\x01\x00\x00K\x05j\x15\x01\x00\x00K\x06j\x15\x01\x00\x00K\x07j\x15\x01\x00\x00K\x08j\x15\x01\x00\x00K\tj\x15\x01\x00\x00K\nj\x15\x01\x00\x00K\x0bj\x15\x01\x00\x00K\x0cj\x15\x01\x00\x00K\rj\x15\x01\x00\x00K\x0ej\x15\x01\x00\x00K\x0fj\x15\x01\x00\x00u\x8c\x0fplate_replicate\x94}\x94(K\x00j\x15\x01\x00\x00K\x01j\x15\x01\x00\x00K\x02j\x15\x01\x00\x00K\x03j\x15\x01\x00\x00K\x04j\x15\x01\x00\x00K\x05j\x15\x01\x00\x00K\x06j\x15\x01\x00\x00K\x07j\x15\x01\x00\x00K\x08\x8c\x0cH3N2_plate-1\x94K\t\x8c\x0cH3N2_plate-2\x94K\n\x8c\x0cH3N2_plate-3\x94K\x0b\x8c\x0cH3N2_plate-4\x94K\x0c\x8c\x0cH3N2_plate-5\x94K\r\x8c\x0cH3N2_plate-6\x94K\x0e\x8c\x0cH3N2_plate-7\x94K\x0f\x8c\x0cH3N2_plate-8\x94uuue}\x94(h-}\x94(h\xcfK\x00N\x86\x94\x8c\x0cplate_params\x94K\x01N\x86\x94uh7]\x94(h9h:eh9h=h?\x85\x94R\x94(h?)}\x94hCh9sNt\x94bh:h=h?\x85\x94R\x94(h?)}\x94hCh:sNt\x94bh\xcfh\x82j#\x01\x00\x00h\x93ub\x8c\twildcards\x94h\x06\x8c\tWildcards\x94\x93\x94)\x81\x94\x8c\nH3N2_plate\x94a}\x94(h-}\x94\x8c\x05plate\x94K\x00N\x86\x94sh7]\x94(h9h:eh9h=h?\x85\x94R\x94(h?)}\x94hCh9sNt\x94bh:h=h?\x85\x94R\x94(h?)}\x94hCh:sNt\x94bj\x13\x01\x00\x00j2\x01\x00\x00ub\x8c\x07threads\x94K\x01\x8c\tresources\x94h\x06\x8c\tResources\x94\x93\x94)\x81\x94(K\x01K\x01\x8c\x04/tmp\x94e}\x94(h-}\x94(\x8c\x06_cores\x94K\x00N\x86\x94\x8c\x06_nodes\x94K\x01N\x86\x94\x8c\x06tmpdir\x94K\x02N\x86\x94uh7]\x94(h9h:eh9h=h?\x85\x94R\x94(h?)}\x94hCh9sNt\x94bh:h=h?\x85\x94R\x94(h?)}\x94hCh:sNt\x94bjH\x01\x00\x00K\x01jJ\x01\x00\x00K\x01jL\x01\x00\x00jE\x01\x00\x00ub\x8c\x03log\x94h\x06\x8c\x03Log\x94\x93\x94)\x81\x94\x8c2results/plates/H3N2_plate/process_H3N2_plate.ipynb\x94a}\x94(h-}\x94\x8c\x08notebook\x94K\x00N\x86\x94sh7]\x94(h9h:eh9h=h?\x85\x94R\x94(h?)}\x94hCh9sNt\x94bh:h=h?\x85\x94R\x94(h?)}\x94hCh:sNt\x94bj^\x01\x00\x00j[\x01\x00\x00ub\x8c\x06config\x94}\x94(\x8c\x10seqneut-pipeline\x94\x8c\x03../\x94\x8c\x04docs\x94\x8c\x07../docs\x94\x8c\x0bdescription\x94X\xba\x01\x00\x00# Test example for [seqneut-pipeline](https://github.com/jbloomlab/seqneut-pipeline)\nThis is a small toy-example created by subsetting a real experiment dataset.\n\nSee [https://github.com/jbloomlab/seqneut-pipeline](https://github.com/jbloomlab/seqneut-pipeline)\nfor the computer code and underlying numerical data.\n\nSee [here](https://github.com/jbloomlab/seqneut-pipeline/graphs/contributors) for a\nlist of all contributors to the pipeline.\n\x94\x8c\x0fviral_libraries\x94}\x94(\x8c\x14pdmH1N1_lib2023_loes\x94\x8c-data/viral_libraries/pdmH1N1_lib2023_loes.csv\x94\x8c\x13H3N2_lib2023_Kikawa\x94\x8c)data/viral_libraries/2023_H3N2_Kikawa.csv\x94u\x8c\x17viral_strain_plot_order\x94\x8c data/viral_strain_plot_order.csv\x94\x8c\x12neut_standard_sets\x94}\x94\x8c\x08loes2023\x94\x8c3data/neut_standard_sets/loes2023_neut_standards.csv\x94s\x8c\x1eillumina_barcode_parser_params\x94}\x94(h\xc7h\xc8h\xc9h\xcah\xcbK\x14h\xccK\x04h\xcdh\xceu\x8c#default_process_plate_qc_thresholds\x94}\x94(h\xa2K\xfah\xa3G?tz\xe1G\xae\x14{h\xa4}\x94(h\xa6G?@bM\xd2\xf1\xa9\xfch\xa7K\x03h\xa8K\x02uh\xa9}\x94(h\xabG?tz\xe1G\xae\x14{h\xacK\x03h\xadK\x02uh\xaeM\xf4\x01h\xafK\x1eh\xb0K\x05h\xb1K\x06u\x8c%default_process_plate_curvefit_params\x94}\x94(h\xb4K\x01h\xb5]\x94(G?\xe8\x00\x00\x00\x00\x00\x00K\x01eh\xb7K\x00h\xb8]\x94(G?\xe9\x99\x99\x99\x99\x99\x9aK\neu\x8c!default_process_plate_curvefit_qc\x94}\x94(h\xbcK\x00h\xbd}\x94(h\xbfG?\xe6ffffffh\xc0G?\xb9\x99\x99\x99\x99\x99\x9auh\xc1]\x94h\xc3]\x94u\x8c\x06plates\x94}\x94(\x8c\x06plate2\x94}\x94(\x8c\x05group\x94\x8c\x05serum\x94\x8c\x04date\x94\x8c\x08datetime\x94\x8c\x04date\x94\x93\x94C\x04\x07\xe7\x08\x01\x94\x85\x94R\x94\x8c\rviral_library\x94\x8c\x14pdmH1N1_lib2023_loes\x94\x8c\x11neut_standard_set\x94\x8c\x08loes2023\x94\x8c\x0bsamples_csv\x94\x8c\x1edata/plates/plate2_samples.csv\x94\x8c\x0cmanual_drops\x94}\x94\x8c\rqc_thresholds\x94}\x94(h\xa2K\xfah\xa3G?tz\xe1G\xae\x14{h\xa4}\x94(h\xa6G?@bM\xd2\xf1\xa9\xfch\xa7K\x03h\xa8K\x02uh\xa9}\x94(h\xabG?tz\xe1G\xae\x14{h\xacK\x03h\xadK\x02uh\xaeM\xf4\x01h\xafK\x1eh\xb0K\x05h\xb1K\x06u\x8c\x0fcurvefit_params\x94}\x94(h\xb4K\x01h\xb5j\x85\x01\x00\x00h\xb7K\x00h\xb8j\x86\x01\x00\x00u\x8c\x0bcurvefit_qc\x94}\x94(h\xbcK\x00h\xbd}\x94(h\xbfG?\xe6ffffffh\xc0G?\xb9\x99\x99\x99\x99\x99\x9auh\xc1j\x8a\x01\x00\x00h\xc3j\x8b\x01\x00\x00u\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\tupstream2\x94\x8c\x06GCTACA\x94\x8c\x12upstream2_mismatch\x94K\x01uu\x8c\x07plate11\x94}\x94(\x8c\x05group\x94\x8c\x05serum\x94\x8c\x04date\x94j\x95\x01\x00\x00C\x04\x07\xe7\t\x1a\x94\x85\x94R\x94\x8c\rviral_library\x94\x8c\x14pdmH1N1_lib2023_loes\x94\x8c\x11neut_standard_set\x94\x8c\x08loes2023\x94\x8c\x0bsamples_csv\x94\x8c\x1fdata/plates/plate11_samples.csv\x94\x8c\x0cmanual_drops\x94}\x94\x8c\x18barcode_serum_replicates\x94]\x94]\x94(\x8c\x10AGTCCTATCCTCAAAT\x94\x8c\x06M099d0\x94eas\x8c\rqc_thresholds\x94}\x94(h\xa2K\xfah\xa3G?tz\xe1G\xae\x14{h\xa4}\x94(h\xa6G?@bM\xd2\xf1\xa9\xfch\xa7K\x03h\xa8K\x02uh\xa9}\x94(h\xabG?tz\xe1G\xae\x14{h\xacK\x03h\xadK\x02uh\xaeM\xf4\x01h\xafK\x1eh\xb0K\x05h\xb1K\x06u\x8c\x0fcurvefit_params\x94}\x94(h\xb4K\x01h\xb5j\x85\x01\x00\x00h\xb7K\x00h\xb8j\x86\x01\x00\x00u\x8c\x0bcurvefit_qc\x94}\x94(h\xbcK\x00h\xbd}\x94(h\xbfG?\xe6ffffffh\xc0G?\xb9\x99\x99\x99\x99\x99\x9auh\xc1j\x8a\x01\x00\x00h\xc3]\x94]\x94(\x8c\x10AGGTCAAGACCACAGG\x94\x8c\x06M099d0\x94eau\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\tupstream2\x94\x8c\x06ATCGAT\x94\x8c\x12upstream2_mismatch\x94K\x01uuj\x15\x01\x00\x00}\x94(h\x94h\x95h\x96j\x95\x01\x00\x00C\x04\x07\xe8\x03\x04\x94\x85\x94R\x94h\x98h\x99h\x9ah\x9bh\x9ch\x9dh\x9e}\x94h\xa0}\x94(h\xa2K\xfah\xa3G?tz\xe1G\xae\x14{h\xa4}\x94(h\xa6G?@bM\xd2\xf1\xa9\xfch\xa7K\x03h\xa8K\x02uh\xa9}\x94(h\xabG?tz\xe1G\xae\x14{h\xacK\x03h\xadK\x02uh\xaeM\xf4\x01h\xafK\x1eh\xb0K\x05h\xb1K\x06uh\xb2}\x94(h\xb4K\x01h\xb5j\x85\x01\x00\x00h\xb7K\x00h\xb8j\x86\x01\x00\x00uh\xba}\x94(h\xbcK\x00h\xbd}\x94(h\xbfG?\xe6ffffffh\xc0G?\xb9\x99\x99\x99\x99\x99\x9auh\xc1j\x8a\x01\x00\x00h\xc3j\x8b\x01\x00\x00uuu\x8c\x16default_serum_titer_as\x94\x8c\x08midpoint\x94\x8c\x1bdefault_serum_qc_thresholds\x94}\x94(\x8c\x0emin_replicates\x94K\x02\x8c\x1bmax_fold_change_from_median\x94K\x03\x8c\x11viruses_ignore_qc\x94]\x94u\x8c\x16sera_override_defaults\x94}\x94\x8c\x05serum\x94}\x94(\x8c\x07M099d30\x94}\x94\x8c\rqc_thresholds\x94}\x94(j\xe5\x01\x00\x00K\x02j\xe6\x01\x00\x00K\x03j\xe7\x01\x00\x00]\x94\x8c\x14A/Belgium/H0017/2022\x94aus\x8c\x07Y044d30\x94}\x94(\x8c\rqc_thresholds\x94}\x94(j\xe5\x01\x00\x00K\x02j\xe6\x01\x00\x00K\x04j\xe7\x01\x00\x00j\xe8\x01\x00\x00u\x8c\x08titer_as\x94\x8c\x04nt50\x94uus\x8c\x14miscellaneous_plates\x94}\x94\x8c\x0erandom_plate_1\x94}\x94(\x8c\x04date\x94j\x95\x01\x00\x00C\x04\x07\xe7\x08\x01\x94\x85\x94R\x94\x8c\rviral_library\x94\x8c\x14pdmH1N1_lib2023_loes\x94\x8c\x11neut_standard_set\x94\x8c\x08loes2023\x94\x8c\x0bsamples_csv\x94\x8c,data/miscellaneous_plates/random_plate_1.csv\x94\x8c\x1eillumina_barcode_parser_params\x94}\x94(\x8c\tupstream2\x94\x8c\x06GCTACA\x94\x8c\x12upstream2_mismatch\x94K\x01uusu\x8c\x04rule\x94\x8c\rprocess_plate\x94\x8c\x0fbench_iteration\x94N\x8c\tscriptdir\x94\x8cO/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/notebooks\x94ub.'); from snakemake.logging import logger; logger.printshellcmds = False; import os; os.chdir(r'/fh/fast/bloom_j/computational_notebooks/jbloom/2024/seqneut-pipeline/test_example');
######## snakemake preamble end #########

Process plate counts to get fraction infectivities and fit curves¶

This notebook is designed to be run using snakemake, and analyzes a plate of sequencing-based neutralization assays.

The plots generated by this notebook are interactive, so you can mouseover points for details, use the mouse-scroll to zoom and pan, and use interactive dropdowns at the bottom of the plots.

Setup¶

Import Python modules:

In [2]:
import pickle
import sys

import altair as alt

import matplotlib.pyplot as plt

import neutcurve

import numpy

import pandas as pd

import ruamel.yaml as yaml

_ = alt.data_transformers.disable_max_rows()

Get the variables passed by snakemake:

In [3]:
count_csvs = snakemake.input.count_csvs
fate_csvs = snakemake.input.fate_csvs
viral_library_csv = snakemake.input.viral_library_csv
neut_standard_set_csv = snakemake.input.neut_standard_set_csv
qc_drops_yaml = snakemake.output.qc_drops
frac_infectivity_csv = snakemake.output.frac_infectivity_csv
fits_csv = snakemake.output.fits_csv
fits_pickle = snakemake.output.fits_pickle
samples = snakemake.params.samples
plate = snakemake.wildcards.plate
plate_params = snakemake.params.plate_params

# get thresholds turning lists into tuples as needed
manual_drops = {
    filter_type: [tuple(w) if isinstance(w, list) else w for w in filter_drops]
    for (filter_type, filter_drops) in plate_params["manual_drops"].items()
}
group = plate_params["group"]
qc_thresholds = plate_params["qc_thresholds"]
curvefit_params = plate_params["curvefit_params"]
curvefit_qc = plate_params["curvefit_qc"]
curvefit_qc["barcode_serum_replicates_ignore_curvefit_qc"] = [
    tuple(w) for w in curvefit_qc["barcode_serum_replicates_ignore_curvefit_qc"]
]

print(f"Processing {plate=}")

samples_df = pd.DataFrame(plate_params["samples"])
print(f"\nPlate has {len(samples)} samples (wells)")
assert all(
    (len(samples_df) == samples_df[c].nunique())
    for c in ["well", "sample", "sample_noplate"]
)
assert len(samples_df) == len(
    samples_df.groupby(["serum_replicate", "dilution_factor"])
)
assert len(samples) == len(count_csvs) == len(fate_csvs) == len(samples_df)

for d, key, title in [
    (manual_drops, "manual_drops", "Data manually specified to drop:"),
    (qc_thresholds, "qc_thresholds", "QC thresholds applied to data:"),
    (curvefit_params, "curvefit_params", "Curve-fitting parameters:"),
    (curvefit_qc, "curvefit_qc", "Curve-fitting QC:"),
]:
    print(f"\n{title}")
    yaml.YAML(typ="rt").dump({key: d}, stream=sys.stdout)
Processing plate='H3N2_plate'

Plate has 16 samples (wells)

Data manually specified to drop:
manual_drops: {}
QC thresholds applied to data:
qc_thresholds:
  avg_barcode_counts_per_well: 250
  min_neut_standard_frac_per_well: 0.005
  no_serum_per_viral_barcode_filters:
    min_frac: 0.0005
    max_fold_change: 3
    max_wells: 2
  per_neut_standard_barcode_filters:
    min_frac: 0.005
    max_fold_change: 3
    max_wells: 2
  min_neut_standard_count_per_well: 500
  min_no_serum_count_per_viral_barcode_well: 30
  max_frac_infectivity_per_viral_barcode_well: 5
  min_dilutions_per_barcode_serum_replicate: 6
Curve-fitting parameters:
curvefit_params:
  frac_infectivity_ceiling: 1
  fixtop:
  - 0.75
  - 1
  fixbottom: 0
  fixslope:
  - 0.8
  - 10
Curve-fitting QC:
curvefit_qc:
  max_frac_infectivity_at_least: 0
  goodness_of_fit:
    min_R2: 0.7
    max_RMSD: 0.1
  serum_replicates_ignore_curvefit_qc: []
  barcode_serum_replicates_ignore_curvefit_qc: []

Set up dictionary to keep track of wells, barcodes, well-barcodes, and serum-replicates that are dropped:

In [4]:
qc_drops = {
    "wells": {},
    "barcodes": {},
    "barcode_wells": {},
    "barcode_serum_replicates": {},
    "serum_replicates": {},
}

assert set(manual_drops).issubset(
    qc_drops
), f"{manual_drops.keys()=}, {qc_drops.keys()}"

Statistics on barcode-parsing for each sample¶

Make interactive chart of the "fates" of the sequencing reads parsed for each sample on the plate.

If most sequencing reads are not "valid barcodes", this could potentially indicate some problem in the sequencing or barcode set you are parsing.

Potential fates are:

  • valid barcode: barcode that matches a known virus or neutralization standard, we hope most reads are this.
  • invalid barcode: a barcode with proper flanking sequences, but does not match a known virus or neutralization standard. If you have a lot of reads of this type, it is probably a good idea to look at the invalid barcode CSVs (in the ./results/barcode_invalid/ subdirectory created by the pipeline) to see what these invalid barcodes are.
  • unparseable barcode: could not parse a barcode from this read as there was not a sequence of the correct length with the appropriate flanking sequence.
  • invalid outer flank: if using an outer upstream or downstream region (upstream2 or downstream2 for the illuminabarcodeparser), reads that are otherwise valid except for this outer flank. Typically you would be using upstream2 if you have a plate index embedded in your primer, and reads with this classification correspond to a different index than the one for this plate.
  • low quality barcode: low-quality or N nucleotides in barcode, could indicate problem with sequencing.
  • failed chastity filter: reads that failed the Illumina chastity filter, if these are reported in the FASTQ (they may not be).

Also, if the number of reads per sample is very uneven, that could indicate that you did not do a good job of balancing the different samples in the Illumina sequencing.

In [5]:
fates = (
    pd.concat([pd.read_csv(f).assign(sample=s) for f, s in zip(fate_csvs, samples)])
    .merge(samples_df, validate="many_to_one", on="sample")
    .assign(
        fate_counts=lambda x: x.groupby("fate")["count"].transform("sum"),
        sample_well=lambda x: x["sample_noplate"] + " (" + x["well"] + ")",
    )
    .query("fate_counts > 0")[  # only keep fates with at least one count
        ["fate", "count", "well", "serum_replicate", "sample_well", "dilution_factor"]
    ]
)

assert len(fates) == len(fates.drop_duplicates())

serum_replicates = sorted(fates["serum_replicate"].unique())
sample_wells = list(
    fates.sort_values(["serum_replicate", "dilution_factor"])["sample_well"]
)


serum_selection = alt.selection_point(
    fields=["serum_replicate"],
    bind=alt.binding_select(
        options=[None] + serum_replicates,
        labels=["all"] + serum_replicates,
        name="serum",
    ),
)

fates_chart = (
    alt.Chart(fates)
    .add_params(serum_selection)
    .transform_filter(serum_selection)
    .encode(
        alt.X("count", scale=alt.Scale(nice=False, padding=3)),
        alt.Y(
            "sample_well",
            title=None,
            sort=sample_wells,
        ),
        alt.Color("fate", sort=sorted(fates["fate"].unique(), reverse=True)),
        alt.Order("fate", sort="descending"),
        tooltip=fates.columns.tolist(),
    )
    .mark_bar(height={"band": 0.85})
    .properties(
        height=alt.Step(10),
        width=200,
        title=f"Barcode parsing for {plate}",
    )
    .configure_axis(grid=False)
)

fates_chart
Out[5]:

Read barcode counts and apply manually specified drops¶

Read the counts per barcode:

In [6]:
# get barcode counts
counts = (
    pd.concat([pd.read_csv(c).assign(sample=s) for c, s in zip(count_csvs, samples)])
    .merge(samples_df, validate="many_to_one", on="sample")
    .drop(columns=["replicate", "plate", "fastq"])
    .assign(sample_well=lambda x: x["sample_noplate"] + " (" + x["well"] + ")")
)

# classify barcodes as viral or neut standard
barcode_class = pd.concat(
    [
        pd.read_csv(viral_library_csv)[["barcode", "strain"]].assign(
            neut_standard=False,
        ),
        pd.read_csv(neut_standard_set_csv)[["barcode"]].assign(
            neut_standard=True,
            strain=pd.NA,
        ),
    ],
    ignore_index=True,
)

# merge counts and classification of barcodes
assert set(counts["barcode"]) == set(barcode_class["barcode"])
counts = counts.merge(barcode_class, on="barcode", validate="many_to_one")
assert set(sample_wells) == set(counts["sample_well"])
assert set(serum_replicates) == set(counts["serum_replicate"])

Apply any manually specified data drops:

In [7]:
for filter_type, filter_drops in manual_drops.items():
    print(f"\nDropping {len(filter_drops)} {filter_type} specified in manual_drops")
    assert filter_type in qc_drops
    qc_drops[filter_type].update(
        {w: "manual_drop" for w in filter_drops if not isinstance(w, list)}
    )
    if filter_type == "barcode_wells":
        counts = counts[
            ~counts.assign(
                barcode_well=lambda x: x.apply(
                    lambda r: (r["barcode"], r["well"]), axis=1
                )
            )["barcode_well"].isin(qc_drops[filter_type])
        ]
    elif filter_type == "barcode_serum_replicates":
        counts = counts[
            ~counts.assign(
                barcode_serum_replicate=lambda x: x.apply(
                    lambda r: (r["barcode"], r["serum_replicate"]), axis=1
                )
            )["barcode_serum_replicate"].isin(qc_drops[filter_type])
        ]
    else:
        assert filter_type in set(counts.columns)
        counts = counts[~counts[filter_type].isin(qc_drops[filter_type])]

Average counts per barcode in each well¶

Plot average counts per barcode. If a sample has inadequate barcode counts, it may not have good enough statistics for accurate analysis, and a QC-threshold is applied:

In [8]:
avg_barcode_counts = (
    counts.groupby(
        ["well", "serum_replicate", "sample_well"],
        dropna=False,
        as_index=False,
    )
    .aggregate(avg_count=pd.NamedAgg("count", "mean"))
    .assign(
        fails_qc=lambda x: (
            x["avg_count"] < qc_thresholds["avg_barcode_counts_per_well"]
        ),
    )
)

avg_barcode_counts_chart = (
    alt.Chart(avg_barcode_counts)
    .add_params(serum_selection)
    .transform_filter(serum_selection)
    .encode(
        alt.X(
            "avg_count",
            title="average barcode counts per well",
            scale=alt.Scale(nice=False, padding=3),
        ),
        alt.Y("sample_well", sort=sample_wells),
        alt.Color(
            "fails_qc",
            title=f"fails {qc_thresholds['avg_barcode_counts_per_well']=}",
            legend=alt.Legend(titleLimit=500),
        ),
        tooltip=[
            alt.Tooltip(c, format=".3g") if avg_barcode_counts[c].dtype == float else c
            for c in avg_barcode_counts.columns
        ],
    )
    .mark_bar(height={"band": 0.85})
    .properties(
        height=alt.Step(10),
        width=250,
        title=f"Average barcode counts per well for {plate}",
    )
    .configure_axis(grid=False)
)

display(avg_barcode_counts_chart)

# drop wells failing QC
avg_barcode_counts_per_well_drops = list(avg_barcode_counts.query("fails_qc")["well"])
print(
    f"\nDropping {len(avg_barcode_counts_per_well_drops)} wells for failing "
    f"{qc_thresholds['avg_barcode_counts_per_well']=}: "
    + str(avg_barcode_counts_per_well_drops)
)
qc_drops["wells"].update(
    {w: "avg_barcode_counts_per_well" for w in avg_barcode_counts_per_well_drops}
)
counts = counts[~counts["well"].isin(qc_drops["wells"])]
Dropping 0 wells for failing qc_thresholds['avg_barcode_counts_per_well']=250: []

Fraction of counts from neutralization standard¶

Determine the fraction of counts from the neutralization standard in each sample, and make sure this fraction passess the QC threshold.

In [9]:
neut_standard_fracs = (
    counts.assign(
        neut_standard_count=lambda x: x["count"] * x["neut_standard"].astype(int)
    )
    .groupby(
        ["well", "serum_replicate", "sample_well"],
        dropna=False,
        as_index=False,
    )
    .aggregate(
        total_count=pd.NamedAgg("count", "sum"),
        neut_standard_count=pd.NamedAgg("neut_standard_count", "sum"),
    )
    .assign(
        neut_standard_frac=lambda x: x["neut_standard_count"] / x["total_count"],
        fails_qc=lambda x: (
            x["neut_standard_frac"] < qc_thresholds["min_neut_standard_frac_per_well"]
        ),
    )
)

neut_standard_fracs_chart = (
    alt.Chart(neut_standard_fracs)
    .add_params(serum_selection)
    .transform_filter(serum_selection)
    .encode(
        alt.X(
            "neut_standard_frac",
            title="frac counts from neutralization standard per well",
            scale=alt.Scale(nice=False, padding=3),
        ),
        alt.Y("sample_well", sort=sample_wells),
        alt.Color(
            "fails_qc",
            title=f"fails {qc_thresholds['min_neut_standard_frac_per_well']=}",
            legend=alt.Legend(titleLimit=500),
        ),
        tooltip=[
            alt.Tooltip(c, format=".3g") if neut_standard_fracs[c].dtype == float else c
            for c in neut_standard_fracs.columns
        ],
    )
    .mark_bar(height={"band": 0.85})
    .properties(
        height=alt.Step(10),
        width=250,
        title=f"Neutralization-standard fracs per well for {plate}",
    )
    .configure_axis(grid=False)
    .configure_legend(titleLimit=1000)
)

display(neut_standard_fracs_chart)

# drop wells failing QC
min_neut_standard_frac_per_well_drops = list(
    neut_standard_fracs.query("fails_qc")["well"]
)
print(
    f"\nDropping {len(min_neut_standard_frac_per_well_drops)} wells for failing "
    f"{qc_thresholds['min_neut_standard_frac_per_well']=}: "
    + str(min_neut_standard_frac_per_well_drops)
)
qc_drops["wells"].update(
    {
        w: "min_neut_standard_frac_per_well"
        for w in min_neut_standard_frac_per_well_drops
    }
)
counts = counts[~counts["well"].isin(qc_drops["wells"])]
Dropping 0 wells for failing qc_thresholds['min_neut_standard_frac_per_well']=0.005: []

Consistency and minimum fractions for barcodes¶

We examine the fraction of counts attributable to each barcode. We do this splitting the data two ways:

  1. Looking at all viral (but not neut-standard) barcodes only for the no-serum samples (wells).

  2. Looking at just the neut-standard barcodes for all samples (wells).

The reasons is that if the experiment is set up perfectly, these fractions should be the same across all samples for each barcode. (We do not expect viral barcodes to have consistent fractions across no-serum samples as they will be neutralized differently depending on strain).

We plot these fractions in interactive plots (you can mouseover points and zoom) so you can identify barcodes that fail the expected consistency QC thresholds.

We also make sure the barcodes meet specified QC minimum thresholds for all samples, and flag any that do not.

In [10]:
barcode_selection = alt.selection_point(fields=["barcode"], on="mouseover", empty=False)

# look at all samples for neut standard barcodes, or no-serum samples for all barcodes
for is_neut_standard, df in counts.groupby("neut_standard"):
    if is_neut_standard:
        print(
            f"\n\n{'=' * 89}\nAnalyzing neut-standard barcodes from all samples (wells)"
        )
        qc_name = "per_neut_standard_barcode_filters"
    else:
        print(f"\n\n{'=' * 89}\nAnalyzing all barcodes from no-serum samples (wells)")
        qc_name = "no_serum_per_viral_barcode_filters"
        df = df.query("serum == 'none'")

    df = df.assign(
        sample_counts=lambda x: x.groupby("sample")["count"].transform("sum"),
        count_frac=lambda x: x["count"] / x["sample_counts"],
        median_count_frac=lambda x: x.groupby("barcode")["count_frac"].transform(
            "median"
        ),
        fold_change_from_median=lambda x: numpy.where(
            x["count_frac"] > x["median_count_frac"],
            x["count_frac"] / x["median_count_frac"],
            x["median_count_frac"] / x["count_frac"],
        ),
    )[
        [
            "barcode",
            "count",
            "well",
            "sample_well",
            "count_frac",
            "median_count_frac",
            "fold_change_from_median",
        ]
        + ([] if is_neut_standard else ["strain"])
    ]

    # barcode fails QC if fails in sufficient wells
    qc = qc_thresholds[qc_name]
    print(f"Apply QC {qc_name}: {qc}\n")
    fails_qc = (
        df.assign(
            fails_qc=lambda x: ~(
                (x["count_frac"] >= qc["min_frac"])
                & (x["fold_change_from_median"] <= qc["max_fold_change"])
            ),
        )
        .groupby("barcode", as_index=False)
        .aggregate(n_wells_fail_qc=pd.NamedAgg("fails_qc", "sum"))
        .assign(fails_qc=lambda x: x["n_wells_fail_qc"] >= qc["max_wells"])[
            ["barcode", "fails_qc"]
        ]
    )
    df = df.merge(fails_qc, on="barcode", validate="many_to_one")

    # make chart
    evenness_chart = (
        alt.Chart(df)
        .add_params(barcode_selection)
        .encode(
            alt.X(
                "count_frac",
                title=(
                    "barcode's fraction of neut standard counts"
                    if is_neut_standard
                    else "barcode's fraction of non-neut standard counts"
                ),
                scale=alt.Scale(nice=False, padding=5),
            ),
            alt.Y("sample_well", sort=sample_wells),
            alt.Fill(
                "fails_qc",
                title=f"fails {qc_name}",
                legend=alt.Legend(titleLimit=500),
            ),
            strokeWidth=alt.condition(barcode_selection, alt.value(2), alt.value(0)),
            size=alt.condition(barcode_selection, alt.value(60), alt.value(35)),
            tooltip=[
                alt.Tooltip(c, format=".2g") if df[c].dtype == float else c
                for c in df.columns
            ],
        )
        .mark_circle(fillOpacity=0.45, stroke="black", strokeOpacity=1)
        .properties(
            height=alt.Step(10),
            width=300,
            title=alt.TitleParams(
                (
                    f"{plate} all samples, neut-standard barcodes"
                    if is_neut_standard
                    else f"{plate} no-serum samples, all barcodes"
                ),
                subtitle="x-axis is zoomable (use mouse scroll/pan)",
            ),
        )
        .configure_axis(grid=False)
        .configure_legend(titleLimit=1000)
        .interactive()
    )

    display(evenness_chart)

    # drop barcodes failing QC
    barcode_drops = list(fails_qc.query("fails_qc")["barcode"])
    print(
        f"\nDropping {len(barcode_drops)} barcodes for failing {qc=}: {barcode_drops}"
    )
    qc_drops["barcodes"].update(
        {bc: "min_neut_standard_frac_per_well" for bc in barcode_drops}
    )
    counts = counts[~counts["barcode"].isin(qc_drops["barcodes"])]

=========================================================================================
Analyzing all barcodes from no-serum samples (wells)
Apply QC no_serum_per_viral_barcode_filters: {'min_frac': 0.0005, 'max_fold_change': 3, 'max_wells': 2}

Dropping 24 barcodes for failing qc={'min_frac': 0.0005, 'max_fold_change': 3, 'max_wells': 2}: ['AAAGTAGCAGAGGATT', 'AATGACAGCTGTCTAG', 'AATGGTCGAGCCATTC', 'ACGCAAATAGACCGAA', 'AGACCATCGCACCCAA', 'ATAGAAAATTATCCGC', 'CAAAAGCAGCACGATA', 'CAATTCGCCGTTCCCC', 'CCAATCCCAGCCTTTA', 'CCCGCTAACCCTGTCT', 'CCCTCCTCAAGGGTAA', 'CCGCAATGACAATTTG', 'CGTACGTATGTCCCAG', 'CGTCCCTGGCGTGTCG', 'CGTTAACGGCCTATCC', 'CTCATTACAGAAATTG', 'CTCCAATAGGAGACGA', 'CTTCATCTCATTTAAA', 'TATACTCACGGAGGAT', 'TATATGGAATACTAAA', 'TCCCCGTGGTTTGACA', 'TCCGCCACTATAACAT', 'TCTCCGATAGCCCTAC', 'TGACAAACACCTGAGG']


=========================================================================================
Analyzing neut-standard barcodes from all samples (wells)
Apply QC per_neut_standard_barcode_filters: {'min_frac': 0.005, 'max_fold_change': 3, 'max_wells': 2}

Dropping 0 barcodes for failing qc={'min_frac': 0.005, 'max_fold_change': 3, 'max_wells': 2}: []

Compute fraction infectivity¶

The fraction infectivity for viral barcode $v_b$ in sample $s$ is computed as: $$ F_{v_b,s} = \frac{c_{v_b,s} / \left(\sum_{n_b} c_{n_b,s}\right)}{{\rm median}_{s_0}\left[ c_{v_b,s_0} / \left(\sum_{n_b} c_{n_b,s_0}\right)\right]} $$ where

  • $c_{v_b,s}$ is the counts of viral barcode $v_b$ in sample $s$.
  • $\sum_{n_b} c_{n_b,s}$ is the sum of the counts for all neutralization standard barcodes $n_b$ for sample $s$.
  • $c_{v_b,s_0}$ is the counts of viral barcode $v_b$ in no-serum sample $s_0$.
  • $\sum_{n_b} c_{n_b,s_0}$ is the sum of the counts for all neutralization standard barcodes $n_b$ for no-serum sample $s_0$.
  • ${\rm median}_{s_0}\left[ c_{v_b,s_0} / \left(\sum_{n_b} c_{n_b,s_0}\right)\right]$ is the median taken across all no-serum samples of the counts of viral barcode $v_b$ versus the total counts for all neutralization standard barcodes.

First, compute the total neutralization-standard counts for each sample (well). Plot these, and drop any wells that do not meet the QC threshold.

In [11]:
neut_standard_counts = (
    counts.query("neut_standard")
    .groupby(
        ["well", "serum_replicate", "sample_well", "dilution_factor"],
        dropna=False,
        as_index=False,
    )
    .aggregate(neut_standard_count=pd.NamedAgg("count", "sum"))
    .assign(
        fails_qc=lambda x: (
            x["neut_standard_count"] < qc_thresholds["min_neut_standard_count_per_well"]
        ),
    )
)

neut_standard_counts_chart = (
    alt.Chart(neut_standard_counts)
    .add_params(serum_selection)
    .transform_filter(serum_selection)
    .encode(
        alt.X(
            "neut_standard_count",
            title="counts from neutralization standard",
            scale=alt.Scale(nice=False, padding=3),
        ),
        alt.Y("sample_well", sort=sample_wells),
        alt.Color(
            "fails_qc",
            title=f"fails {qc_thresholds['min_neut_standard_count_per_well']=}",
            legend=alt.Legend(titleLimit=500),
        ),
        tooltip=[
            (
                alt.Tooltip(c, format=".3g")
                if neut_standard_counts[c].dtype == float
                else c
            )
            for c in neut_standard_counts.columns
        ],
    )
    .mark_bar(height={"band": 0.85})
    .properties(
        height=alt.Step(10),
        width=250,
        title=f"Neutralization-standard counts for {plate}",
    )
    .configure_axis(grid=False)
    .configure_legend(titleLimit=1000)
)

display(neut_standard_counts_chart)

# drop wells failing QC
min_neut_standard_count_per_well_drops = list(
    neut_standard_counts.query("fails_qc")["well"]
)
print(
    f"\nDropping {len(min_neut_standard_count_per_well_drops)} wells for failing "
    f"{qc_thresholds['min_neut_standard_count_per_well']=}: "
    + str(min_neut_standard_count_per_well_drops)
)
qc_drops["wells"].update(
    {
        w: "min_neut_standard_count_per_well"
        for w in min_neut_standard_count_per_well_drops
    }
)
neut_standard_counts = neut_standard_counts[
    ~neut_standard_counts["well"].isin(qc_drops["wells"])
]
counts = counts[~counts["well"].isin(qc_drops["wells"])]
Dropping 0 wells for failing qc_thresholds['min_neut_standard_count_per_well']=500: []

Compute and plot the no-serum sample viral barcode counts and check if they pass the QC filters.

In [12]:
no_serum_counts = (
    counts.query("serum == 'none'")
    .query("not neut_standard")
    .merge(neut_standard_counts, validate="many_to_one")[
        ["barcode", "strain", "well", "sample_well", "count", "neut_standard_count"]
    ]
    .assign(
        fails_qc=lambda x: (
            x["count"] <= qc_thresholds["min_no_serum_count_per_viral_barcode_well"]
        ),
    )
)

strains = sorted(no_serum_counts["strain"].unique())
strain_selection_dropdown = alt.selection_point(
    fields=["strain"],
    bind=alt.binding_select(
        options=[None] + strains,
        labels=["all"] + strains,
        name="virus strain",
    ),
)

# make chart
no_serum_counts_chart = (
    alt.Chart(no_serum_counts)
    .add_params(barcode_selection, strain_selection_dropdown)
    .transform_filter(strain_selection_dropdown)
    .encode(
        alt.X(
            "count", title="viral barcode count", scale=alt.Scale(nice=False, padding=5)
        ),
        alt.Y("sample_well", sort=sample_wells),
        alt.Fill(
            "fails_qc",
            title=f"fails {qc_thresholds['min_no_serum_count_per_viral_barcode_well']=}",
            legend=alt.Legend(titleLimit=500),
        ),
        strokeWidth=alt.condition(barcode_selection, alt.value(2), alt.value(0)),
        size=alt.condition(barcode_selection, alt.value(60), alt.value(35)),
        tooltip=no_serum_counts.columns.tolist(),
    )
    .mark_circle(fillOpacity=0.6, stroke="black", strokeOpacity=1)
    .properties(
        height=alt.Step(10),
        width=400,
        title=f"{plate} viral barcode counts in no-serum samples",
    )
    .configure_axis(grid=False)
    .configure_legend(titleLimit=1000)
    .interactive()
)

display(no_serum_counts_chart)

# drop barcode / wells failing QC
min_no_serum_count_per_viral_barcode_well_drops = list(
    no_serum_counts.query("fails_qc")[["barcode", "well"]].itertuples(
        index=False, name=None
    )
)
print(
    f"\nDropping {len(min_no_serum_count_per_viral_barcode_well_drops)} barcode-wells for failing "
    f"{qc_thresholds['min_no_serum_count_per_viral_barcode_well']=}: "
    + str(min_no_serum_count_per_viral_barcode_well_drops)
)
qc_drops["barcode_wells"].update(
    {
        w: "min_no_serum_count_per_viral_barcode_well"
        for w in min_no_serum_count_per_viral_barcode_well_drops
    }
)
no_serum_counts = no_serum_counts[
    ~no_serum_counts.assign(
        barcode_well=lambda x: x.apply(lambda r: (r["barcode"], r["well"]), axis=1)
    )["barcode_well"].isin(qc_drops["barcode_wells"])
]
counts = counts[
    ~counts.assign(
        barcode_well=lambda x: x.apply(lambda r: (r["barcode"], r["well"]), axis=1)
    )["barcode_well"].isin(qc_drops["barcode_wells"])
]
Dropping 4 barcode-wells for failing qc_thresholds['min_no_serum_count_per_viral_barcode_well']=30: [('CCTATAAGGCCTTACG', 'A10'), ('GATTCACGGCCCACAA', 'B10'), ('CACCGACCAACTCTCT', 'E10'), ('GACCAAAGCTGCAGGG', 'E10')]

Compute and plot the median ratio of viral barcode count to neut standard counts across no-serum samples. If library composition is equal, all of these values should be similar:

In [13]:
median_no_serum_ratio = (
    no_serum_counts.assign(ratio=lambda x: x["count"] / x["neut_standard_count"])
    .groupby(["barcode", "strain"], as_index=False)
    .aggregate(median_no_serum_ratio=pd.NamedAgg("ratio", "median"))
)

strain_selection = alt.selection_point(fields=["strain"], on="mouseover", empty=False)

median_no_serum_ratio_chart = (
    alt.Chart(median_no_serum_ratio)
    .add_params(strain_selection)
    .encode(
        alt.X(
            "median_no_serum_ratio",
            title="median ratio of counts",
            scale=alt.Scale(nice=False, padding=5),
        ),
        alt.Y(
            "barcode",
            sort=alt.SortField("median_no_serum_ratio", order="descending"),
            axis=alt.Axis(labelFontSize=5),
        ),
        color=alt.condition(strain_selection, alt.value("orange"), alt.value("gray")),
        tooltip=[
            (
                alt.Tooltip(c, format=".3g")
                if median_no_serum_ratio[c].dtype == float
                else c
            )
            for c in median_no_serum_ratio.columns
        ],
    )
    .mark_bar(height={"band": 0.85})
    .properties(
        height=alt.Step(5),
        width=250,
        title=f"{plate} no-serum median ratio viral barcode to neut-standard barcode",
    )
    .configure_axis(grid=False)
    .configure_legend(titleLimit=1000)
)

display(median_no_serum_ratio_chart)

Compute the actual fraction infectivities. We compute both the raw fraction infectivities and the ones with the ceiling applied:

In [14]:
frac_infectivity = (
    counts.query("not neut_standard")
    .query("serum != 'none'")
    .merge(median_no_serum_ratio, validate="many_to_one")
    .merge(neut_standard_counts, validate="many_to_one")
    .assign(
        frac_infectivity_raw=lambda x: (
            (x["count"] / x["neut_standard_count"]) / x["median_no_serum_ratio"]
        ),
        frac_infectivity_ceiling=lambda x: x["frac_infectivity_raw"].clip(
            upper=curvefit_params["frac_infectivity_ceiling"]
        ),
        concentration=lambda x: 1 / x["dilution_factor"],
        plate_barcode=lambda x: x["plate_replicate"] + "-" + x["barcode"],
    )[
        [
            "barcode",
            "plate_barcode",
            "well",
            "strain",
            "serum",
            "serum_replicate",
            "dilution_factor",
            "concentration",
            "frac_infectivity_raw",
            "frac_infectivity_ceiling",
        ]
    ]
)

assert len(
    frac_infectivity.groupby(["serum", "plate_barcode", "dilution_factor"])
) == len(frac_infectivity)
assert frac_infectivity["dilution_factor"].notnull().all()
assert frac_infectivity["frac_infectivity_raw"].notnull().all()
assert frac_infectivity["frac_infectivity_ceiling"].notnull().all()

Plot the fraction infectivities, both the raw values and with the ceiling applied:

In [15]:
frac_infectivity_chart_df = (
    frac_infectivity.assign(
        fails_qc=lambda x: (
            x["frac_infectivity_raw"]
            > qc_thresholds["max_frac_infectivity_per_viral_barcode_well"]
        ),
    )
    .melt(
        id_vars=[
            "barcode",
            "strain",
            "well",
            "serum_replicate",
            "dilution_factor",
            "fails_qc",
        ],
        value_vars=["frac_infectivity_raw", "frac_infectivity_ceiling"],
        var_name="ceiling_applied",
        value_name="frac_infectivity",
    )
    .assign(
        ceiling_applied=lambda x: x["ceiling_applied"].map(
            {
                "frac_infectivity_raw": "raw fraction infectivity",
                "frac_infectivity_ceiling": f"fraction infectivity with ceiling at {curvefit_params['frac_infectivity_ceiling']}",
            }
        )
    )
)

frac_infectivity_chart = (
    alt.Chart(frac_infectivity_chart_df)
    .add_params(strain_selection_dropdown, barcode_selection)
    .transform_filter(strain_selection_dropdown)
    .encode(
        alt.X(
            "dilution_factor",
            title="dilution factor",
            scale=alt.Scale(nice=False, padding=5, type="log"),
        ),
        alt.Y(
            "frac_infectivity",
            title="fraction infectivity",
            scale=alt.Scale(nice=False, padding=5),
        ),
        alt.Column(
            "ceiling_applied",
            sort="descending",
            title=None,
            header=alt.Header(labelFontSize=13, labelFontStyle="bold", labelPadding=2),
        ),
        alt.Row(
            "serum_replicate",
            title=None,
            spacing=3,
            header=alt.Header(labelFontSize=13, labelFontStyle="bold"),
        ),
        alt.Detail("barcode"),
        alt.Shape(
            "fails_qc",
            title=f"fails {qc_thresholds['max_frac_infectivity_per_viral_barcode_well']=}",
            legend=alt.Legend(titleLimit=500, orient="bottom"),
        ),
        color=alt.condition(
            barcode_selection, alt.value("black"), alt.value("MediumBlue")
        ),
        strokeWidth=alt.condition(barcode_selection, alt.value(3), alt.value(1)),
        opacity=alt.condition(barcode_selection, alt.value(1), alt.value(0.25)),
        tooltip=[
            (
                alt.Tooltip(c, format=".3g")
                if frac_infectivity_chart_df[c].dtype == float
                else c
            )
            for c in frac_infectivity_chart_df.columns
        ],
    )
    .mark_line(point=True)
    .properties(
        height=150,
        width=250,
        title=f"Fraction infectivities for {plate}",
    )
    .interactive(bind_x=False)
    .configure_axis(grid=False)
    .configure_legend(titleLimit=1000)
    .configure_point(size=50)
    .resolve_scale(x="independent", y="independent")
)

display(frac_infectivity_chart)

# drop barcode / wells failing QC
max_frac_infectivity_per_viral_barcode_well_drops = list(
    frac_infectivity_chart_df.query("fails_qc")[["barcode", "well"]]
    .drop_duplicates()
    .itertuples(index=False, name=None)
)
print(
    f"\nDropping {len(max_frac_infectivity_per_viral_barcode_well_drops)} barcode-wells for failing "
    f"{qc_thresholds['max_frac_infectivity_per_viral_barcode_well']=}: "
    + str(max_frac_infectivity_per_viral_barcode_well_drops)
)
qc_drops["barcode_wells"].update(
    {
        w: "max_frac_infectivity_per_viral_barcode_well"
        for w in max_frac_infectivity_per_viral_barcode_well_drops
    }
)
frac_infectivity = frac_infectivity[
    ~frac_infectivity.assign(
        barcode_well=lambda x: x.apply(lambda r: (r["barcode"], r["well"]), axis=1)
    )["barcode_well"].isin(qc_drops["barcode_wells"])
]
Dropping 0 barcode-wells for failing qc_thresholds['max_frac_infectivity_per_viral_barcode_well']=5: []

Check how many dilutions we have per barcode / serum-replicate:

In [16]:
n_dilutions = (
    frac_infectivity.groupby(["serum_replicate", "strain", "barcode"], as_index=False)
    .aggregate(**{"number of dilutions": pd.NamedAgg("dilution_factor", "nunique")})
    .assign(
        fails_qc=lambda x: (
            x["number of dilutions"]
            < qc_thresholds["min_dilutions_per_barcode_serum_replicate"]
        ),
    )
)

n_dilutions_chart = (
    alt.Chart(n_dilutions)
    .add_params(barcode_selection)
    .encode(
        alt.X("number of dilutions", scale=alt.Scale(nice=False, padding=4)),
        alt.Y("strain", title=None),
        alt.Column(
            "serum_replicate",
            title=None,
            header=alt.Header(labelFontSize=12, labelFontStyle="bold", labelPadding=0),
        ),
        alt.Fill(
            "fails_qc",
            title=f"fails {qc_thresholds['min_dilutions_per_barcode_serum_replicate']=}",
            legend=alt.Legend(titleLimit=500, orient="bottom"),
        ),
        strokeWidth=alt.condition(barcode_selection, alt.value(2), alt.value(0)),
        size=alt.condition(barcode_selection, alt.value(55), alt.value(35)),
        tooltip=[
            alt.Tooltip(c, format=".3g") if n_dilutions[c].dtype == float else c
            for c in n_dilutions.columns
        ],
    )
    .mark_circle(stroke="black", strokeOpacity=1, fillOpacity=0.45)
    .properties(
        height=alt.Step(10),
        width=120,
        title=alt.TitleParams(
            "number of dilutions for each barcode for each serum-replicate", dy=-2
        ),
    )
)

display(n_dilutions_chart)

# drop barcode / serum-replicates failing QC
min_dilutions_per_barcode_serum_replicate_drops = list(
    n_dilutions.query("fails_qc")[["barcode", "serum_replicate"]].itertuples(
        index=False, name=None
    )
)
print(
    f"\nDropping {len(min_dilutions_per_barcode_serum_replicate_drops)} barcode/serum-replicates for failing "
    f"{qc_thresholds['min_dilutions_per_barcode_serum_replicate']=}: "
    + str(min_dilutions_per_barcode_serum_replicate_drops)
)
qc_drops["barcode_serum_replicates"].update(
    {
        w: "min_dilutions_per_barcode_serum_replicate"
        for w in min_dilutions_per_barcode_serum_replicate_drops
    }
)
frac_infectivity = frac_infectivity[
    ~frac_infectivity.assign(
        barcode_serum_replicate=lambda x: x.apply(
            lambda r: (r["barcode"], r["serum_replicate"]), axis=1
        )
    )["barcode_serum_replicate"].isin(qc_drops["barcode_serum_replicates"])
]
Dropping 0 barcode/serum-replicates for failing qc_thresholds['min_dilutions_per_barcode_serum_replicate']=6: []

Fit neutralization curves without applying QC to curves¶

First fit curves to all serum replicates, then we will apply QC on the curve fits. Note that the fitting is done to the fraction infectivities with the ceiling:

In [17]:
fits_noqc = neutcurve.CurveFits(
    frac_infectivity.rename(
        columns={
            "frac_infectivity_ceiling": "fraction infectivity",
            "concentration": "serum concentration",
        }
    ),
    conc_col="serum concentration",
    fracinf_col="fraction infectivity",
    virus_col="strain",
    serum_col="serum_replicate",
    replicate_col="barcode",
    fixtop=curvefit_params["fixtop"],
    fixbottom=curvefit_params["fixbottom"],
    fixslope=curvefit_params["fixslope"],
)

Determine which fits fail the curve fitting QC, and plot them. Note the plot indicates as failing QC any barcode / serum-replicate that fails, even if we are also specified to ignore the QC for that one (so it will not be removed later):

In [18]:
goodness_of_fit = curvefit_qc["goodness_of_fit"]

fit_params_noqc = (
    frac_infectivity.groupby(["serum_replicate", "barcode"], as_index=False)
    .aggregate(max_frac_infectivity=pd.NamedAgg("frac_infectivity_ceiling", "max"))
    .merge(
        fits_noqc.fitParams(average_only=False, no_average=True)[
            ["serum", "virus", "replicate", "r2", "rmsd"]
        ].rename(columns={"serum": "serum_replicate", "replicate": "barcode"}),
        validate="one_to_one",
    )
    .assign(
        fails_max_frac_infectivity_at_least=lambda x: (
            x["max_frac_infectivity"] < curvefit_qc["max_frac_infectivity_at_least"]
        ),
        fails_goodness_of_fit=lambda x: (
            (x["r2"] < goodness_of_fit["min_R2"])
            & (x["rmsd"] > goodness_of_fit["max_RMSD"])
        ),
        fails_qc=lambda x: (
            x["fails_max_frac_infectivity_at_least"] | x["fails_goodness_of_fit"]
        ),
        ignore_qc=lambda x: x.apply(
            lambda r: (
                (
                    r["serum_replicate"]
                    in curvefit_qc["serum_replicates_ignore_curvefit_qc"]
                )
                or (
                    (r["barcode"], r["serum_replicate"])
                    in curvefit_qc["barcode_serum_replicates_ignore_curvefit_qc"]
                )
            ),
            axis=1,
        ),
    )
)

print(f"Plotting barcode / serum-replicates that fail {curvefit_qc=}\n")

for prop, col in [
    ("max frac infectivity", "max_frac_infectivity"),
    ("curve fit R2", "r2"),
    ("curve fit RMSD", "rmsd"),
]:
    fit_params_noqc_chart = (
        alt.Chart(fit_params_noqc)
        .add_params(barcode_selection)
        .encode(
            alt.X(col, title=prop, scale=alt.Scale(nice=False, padding=4)),
            alt.Y("virus", title=None),
            alt.Fill("fails_qc"),
            alt.Column(
                "serum_replicate",
                title=None,
                header=alt.Header(
                    labelFontSize=12, labelFontStyle="bold", labelPadding=0
                ),
            ),
            strokeWidth=alt.condition(barcode_selection, alt.value(2), alt.value(0)),
            size=alt.condition(barcode_selection, alt.value(55), alt.value(35)),
            tooltip=[
                alt.Tooltip(c, format=".3g") if fit_params_noqc[c].dtype == float else c
                for c in fit_params_noqc.columns
            ],
        )
        .mark_circle(stroke="black", strokeOpacity=1, fillOpacity=0.55)
        .properties(
            height=alt.Step(10),
            width=120,
            title=alt.TitleParams(f"{prop} for each barcode serum-replicate", dy=-2),
        )
    )
    display(fit_params_noqc_chart)
Plotting barcode / serum-replicates that fail curvefit_qc={'max_frac_infectivity_at_least': 0, 'goodness_of_fit': {'min_R2': 0.7, 'max_RMSD': 0.1}, 'serum_replicates_ignore_curvefit_qc': [], 'barcode_serum_replicates_ignore_curvefit_qc': []}

Now get all barcode / serum-replicate pairs that fail any of the QC. Plot curves for just these virus / serum-replicates (we plot all barcodes for a virus even if just one fails QC), and then exclude any that are not specified to ignore the QC:

In [19]:
barcode_serum_replicates_fail_qc = fit_params_noqc.query("fails_qc").reset_index(
    drop=True
)
print(f"Here are barcode / serum-replicates that fail {curvefit_qc=}")
display(barcode_serum_replicates_fail_qc)

if len(barcode_serum_replicates_fail_qc):
    print("\nCurves for viruses and serum-replicates with at least one failed barcode:")
    fig, _ = fits_noqc.plotReplicates(
        sera=sorted(barcode_serum_replicates_fail_qc["serum_replicate"].unique()),
        viruses=sorted(barcode_serum_replicates_fail_qc["virus"].unique()),
        attempt_shared_legend=False,
        legendfontsize=8,
        titlesize=10,
        ticksize=10,
        ncol=6,
        draw_in_bounds=True,
    )
    display(fig)
    plt.close(fig)

# drop barcode / serum-replicates failing QC
for qc_filter in ["max_frac_infectivity_at_least", "goodness_of_fit"]:
    fits_qc_drops = list(
        fit_params_noqc.query(f"fails_{qc_filter} and (not ignore_qc)")[
            ["barcode", "serum_replicate"]
        ].itertuples(index=False, name=None)
    )
    print(
        f"\nDropping {len(fits_qc_drops)} barcode/serum-replicates for failing "
        f"{qc_filter}={curvefit_qc[qc_filter]}: " + str(fits_qc_drops)
    )
    qc_drops["barcode_serum_replicates"].update({w: qc_filter for w in fits_qc_drops})
    frac_infectivity = frac_infectivity[
        ~frac_infectivity.assign(
            barcode_serum_replicate=lambda x: x.apply(
                lambda r: (r["barcode"], r["serum_replicate"]), axis=1
            )
        )["barcode_serum_replicate"].isin(qc_drops["barcode_serum_replicates"])
    ]
    fit_params_noqc = fit_params_noqc[
        ~fit_params_noqc.assign(
            barcode_serum_replicate=lambda x: x.apply(
                lambda r: (r["barcode"], r["serum_replicate"]), axis=1
            )
        )["barcode_serum_replicate"].isin(qc_drops["barcode_serum_replicates"])
    ]
Here are barcode / serum-replicates that fail curvefit_qc={'max_frac_infectivity_at_least': 0, 'goodness_of_fit': {'min_R2': 0.7, 'max_RMSD': 0.1}, 'serum_replicates_ignore_curvefit_qc': [], 'barcode_serum_replicates_ignore_curvefit_qc': []}
serum_replicate barcode max_frac_infectivity virus r2 rmsd fails_max_frac_infectivity_at_least fails_goodness_of_fit fails_qc ignore_qc
0 A23038d0_r32_75K AAAGATAAATTCAAAA 1.000000 A/SENDAI/45/2023 0.617148 0.195367 False True True False
1 A23038d0_r32_75K AAATGCTGAGAGGGTA 0.862929 A/South_Africa/R07188/2023 0.551583 0.186557 False True True False
2 A23038d0_r32_75K AACGACACTTACATCC 1.000000 A/Netherlands/01693/2023 0.636100 0.180683 False True True False
3 A23038d0_r32_75K AATAGGCCCAAATCCA 1.000000 A/Oman/3011/2023 0.565405 0.190553 False True True False
4 A23038d0_r32_75K AATCTTTCCAATCTTG 1.000000 A/SouthSudan/642/2023 0.502680 0.222618 False True True False
5 A23038d0_r32_75K AATTACGCATAGGCCA 0.892113 A/Malaysia/IMR-SARI1989/2023 0.531560 0.203096 False True True False
6 A23038d0_r32_75K ACAAAGATAAAAATTT 1.000000 A/Guangdong-Futian/1980/2023 0.639012 0.184513 False True True False
7 A23038d0_r32_75K AGATCATAAGCAATAA 1.000000 A/Catalonia/2041146NS/2023 0.585573 0.192508 False True True False
8 A23038d0_r32_75K AGGGACTTTATTGTCC 1.000000 A/South_Dakota/22/2023 0.552145 0.236778 False True True False
9 A23038d0_r32_75K AGGTGCGAGCCATCAG 0.758526 A/Bangkok/P3599/2023 0.429296 0.187407 False True True False
10 A23038d0_r32_75K ATCGAAAAAACTGCAA 0.997784 A/South_Africa/K056872/2023 0.673572 0.157786 False True True False
11 A23038d0_r32_75K ATCGATTCGATTGACG 0.668517 A/Victoria/1033/2023 0.452359 0.170759 False True True False
12 A23038d0_r32_75K ATGGTTATCTTACCTT 1.000000 A/Luga/RII-11393S/2023 0.540049 0.243452 False True True False
13 A23038d0_r32_75K ATTAGATTATAACGTA 1.000000 A/Cambodia/e0826360/2020 0.569252 0.234189 False True True False
14 A23038d0_r32_75K CACCGACCAACTCTCT 0.668522 A/South_Africa/R06240/2023 -0.099671 0.203397 False True True False
15 A23038d0_r32_75K CACGTTAGTGAGACTT 0.943296 A/Romania/543634/2022 0.675439 0.168308 False True True False
16 A23038d0_r32_75K CATAAAAGACTGTATA 1.000000 A/Thailand/8/2022 0.613899 0.184037 False True True False
17 A23038d0_r32_75K CATAATGCACAAACGC 0.827674 A/South_Africa/R06359/2023 0.561621 0.175688 False True True False
18 A23038d0_r32_75K CCACAAGTTTGAAAAC 0.951468 A/Finland/399/2023 0.556492 0.184352 False True True False
19 A23038d0_r32_75K CCATCACCTTATACAC 0.927907 A/Netherlands/01693/2023 0.351597 0.222657 False True True False
20 A23038d0_r32_75K CCGATAAGACGTCGCT 0.926675 A/California/81/2023 0.685631 0.161735 False True True False
21 A23038d0_r32_75K CCTATAAGGCCTTACG 1.000000 A/Wisconsin/27/2023 0.311398 0.247445 False True True False
22 A23038d0_r32_75K CGCGAACAACAGGGGA 0.935992 A/Krabi/THIS050/2023 0.661590 0.172954 False True True False
23 A23038d0_r32_75K CGTCAGAAGTTTATAA 1.000000 A/Luga/RII-11393S/2023 0.443012 0.261734 False True True False
24 A23038d0_r32_75K CTATCTTAATCTACAG 1.000000 A/Darwin/6/2021 -0.350804 0.278777 False True True False
25 A23038d0_r32_75K CTGCGAATATTGTGAC 1.000000 A/SENDAI/45/2023 0.673053 0.203210 False True True False
26 A23038d0_r32_75K CTTGAATACACAAACA 1.000000 A/KANAGAWA/AC2316/2023 0.687681 0.177308 False True True False
27 A23038d0_r32_75K GAAGTAACAAACTATG 0.776149 A/South_Dakota/22/2023 0.665492 0.131256 False True True False
28 A23038d0_r32_75K GACCAAAAAGCAGTAT 0.731881 A/Saskatchewan/SKFLU317847/2023 0.694542 0.126576 False True True False
29 A23038d0_r32_75K GACCAAAGCTGCAGGG 1.000000 A/KANAGAWA/IC2239/2023 0.638333 0.181234 False True True False
30 A23038d0_r32_75K GATCTGCTTGGAATGT 0.604265 A/South_Africa/R06477/2023 0.609075 0.107394 False True True False
31 A23038d0_r32_75K GCATTATAATCTTGTG 1.000000 A/Busan/1301/2023 0.692280 0.183702 False True True False
32 A23038d0_r32_75K GCTGGTGCACAAGATT 1.000000 A/Abu_Dhabi/6753/2023 0.688429 0.179615 False True True False
33 A23038d0_r32_75K GTTATTATGACTTCAT 1.000000 A/Darwin/6/2021 0.542392 0.196614 False True True False
34 A23038d0_r32_75K TACGAAAATCAAGAGC 0.523915 A/Oman/3011/2023 0.592605 0.123531 False True True False
35 A23038d0_r32_75K TATCAATTCGGTATTA 1.000000 A/Finland/391/2023 0.576482 0.217750 False True True False
36 A23038d0_r32_75K TATCGCAATATGATAA 0.630187 A/Abu_Dhabi/6753/2023 0.460040 0.164871 False True True False
37 A23038d0_r32_75K TCAATGAATGCGGGGT 0.979394 A/Finland/399/2023 0.513232 0.137650 False True True False
38 A23038d0_r32_75K TCCACACCCCTAGCTA 1.000000 A/Massachusets/18/2022 0.306581 0.284225 False True True False
39 A23038d0_r32_75K TCTTAGAGTGAACGAT 0.938604 A/Hong_Kong/4801/2014 0.694397 0.168430 False True True False
40 A23038d0_r32_75K TGAGATCAGCCGGGTG 0.718790 A/Netherlands/01685/2023 0.696107 0.126304 False True True False
41 A23038d0_r32_75K TGCCGATCCAATTGAT 0.790778 A/KANAGAWA/AC2316/2023 0.699921 0.134019 False True True False
42 A23038d0_r32_75K TGTAGTATAAGAATAA 0.652219 A/KANAGAWA/IC2239/2023 0.692158 0.133256 False True True False
43 A23038d0_r32_75K TTAGCAGTTAACGTAT 0.861977 A/YAMAGATA/98/2023 0.611730 0.171924 False True True False
44 A23038d0_r32_75K TTATGTTTTAATGGTA 1.000000 A/Guangdong-Futian/1980/2023 0.289322 0.259267 False True True False
45 A23038d0_r32_75K TTGACTCACCGAATAA 1.000000 A/Cambodia/e0826360/2020 0.637421 0.176567 False True True False
Curves for viruses and serum-replicates with at least one failed barcode:
No description has been provided for this image
Dropping 0 barcode/serum-replicates for failing max_frac_infectivity_at_least=0: []

Dropping 46 barcode/serum-replicates for failing goodness_of_fit={'min_R2': 0.7, 'max_RMSD': 0.1}: [('AAAGATAAATTCAAAA', 'A23038d0_r32_75K'), ('AAATGCTGAGAGGGTA', 'A23038d0_r32_75K'), ('AACGACACTTACATCC', 'A23038d0_r32_75K'), ('AATAGGCCCAAATCCA', 'A23038d0_r32_75K'), ('AATCTTTCCAATCTTG', 'A23038d0_r32_75K'), ('AATTACGCATAGGCCA', 'A23038d0_r32_75K'), ('ACAAAGATAAAAATTT', 'A23038d0_r32_75K'), ('AGATCATAAGCAATAA', 'A23038d0_r32_75K'), ('AGGGACTTTATTGTCC', 'A23038d0_r32_75K'), ('AGGTGCGAGCCATCAG', 'A23038d0_r32_75K'), ('ATCGAAAAAACTGCAA', 'A23038d0_r32_75K'), ('ATCGATTCGATTGACG', 'A23038d0_r32_75K'), ('ATGGTTATCTTACCTT', 'A23038d0_r32_75K'), ('ATTAGATTATAACGTA', 'A23038d0_r32_75K'), ('CACCGACCAACTCTCT', 'A23038d0_r32_75K'), ('CACGTTAGTGAGACTT', 'A23038d0_r32_75K'), ('CATAAAAGACTGTATA', 'A23038d0_r32_75K'), ('CATAATGCACAAACGC', 'A23038d0_r32_75K'), ('CCACAAGTTTGAAAAC', 'A23038d0_r32_75K'), ('CCATCACCTTATACAC', 'A23038d0_r32_75K'), ('CCGATAAGACGTCGCT', 'A23038d0_r32_75K'), ('CCTATAAGGCCTTACG', 'A23038d0_r32_75K'), ('CGCGAACAACAGGGGA', 'A23038d0_r32_75K'), ('CGTCAGAAGTTTATAA', 'A23038d0_r32_75K'), ('CTATCTTAATCTACAG', 'A23038d0_r32_75K'), ('CTGCGAATATTGTGAC', 'A23038d0_r32_75K'), ('CTTGAATACACAAACA', 'A23038d0_r32_75K'), ('GAAGTAACAAACTATG', 'A23038d0_r32_75K'), ('GACCAAAAAGCAGTAT', 'A23038d0_r32_75K'), ('GACCAAAGCTGCAGGG', 'A23038d0_r32_75K'), ('GATCTGCTTGGAATGT', 'A23038d0_r32_75K'), ('GCATTATAATCTTGTG', 'A23038d0_r32_75K'), ('GCTGGTGCACAAGATT', 'A23038d0_r32_75K'), ('GTTATTATGACTTCAT', 'A23038d0_r32_75K'), ('TACGAAAATCAAGAGC', 'A23038d0_r32_75K'), ('TATCAATTCGGTATTA', 'A23038d0_r32_75K'), ('TATCGCAATATGATAA', 'A23038d0_r32_75K'), ('TCAATGAATGCGGGGT', 'A23038d0_r32_75K'), ('TCCACACCCCTAGCTA', 'A23038d0_r32_75K'), ('TCTTAGAGTGAACGAT', 'A23038d0_r32_75K'), ('TGAGATCAGCCGGGTG', 'A23038d0_r32_75K'), ('TGCCGATCCAATTGAT', 'A23038d0_r32_75K'), ('TGTAGTATAAGAATAA', 'A23038d0_r32_75K'), ('TTAGCAGTTAACGTAT', 'A23038d0_r32_75K'), ('TTATGTTTTAATGGTA', 'A23038d0_r32_75K'), ('TTGACTCACCGAATAA', 'A23038d0_r32_75K')]

Fit neutralization curves after applying QC¶

No we re-fit curves after applying all the QC:

In [20]:
fits_qc = neutcurve.CurveFits(
    frac_infectivity.rename(
        columns={
            "frac_infectivity_ceiling": "fraction infectivity",
            "concentration": "serum concentration",
        }
    ),
    conc_col="serum concentration",
    fracinf_col="fraction infectivity",
    virus_col="strain",
    serum_col="serum",
    replicate_col="plate_barcode",
    fixtop=curvefit_params["fixtop"],
    fixbottom=curvefit_params["fixbottom"],
    fixslope=curvefit_params["fixslope"],
)

fit_params_qc = fits_qc.fitParams(average_only=False, no_average=True)
assert len(fit_params_qc) <= len(
    fits_noqc.fitParams(average_only=False, no_average=True)
)

print(f"Assigning fits for this plate to {group}")
fit_params_qc.insert(0, "group", group)
Assigning fits for this plate to pilot

Plot all the curves that passed QC:

In [21]:
if fits_qc.sera:
    _ = fits_qc.plotReplicates(
        attempt_shared_legend=False,
        legendfontsize=8,
        titlesize=10,
        ticksize=10,
        ncol=6,
        draw_in_bounds=True,
    )
else:
    print("No sera passed QC.")
No description has been provided for this image

Save results to files¶

In [22]:
print(f"Writing fraction infectivities to {frac_infectivity_csv}")
(
    frac_infectivity[
        [
            "serum",
            "strain",
            "plate_barcode",
            "dilution_factor",
            "frac_infectivity_raw",
            "frac_infectivity_ceiling",
        ]
    ]
    .sort_values(["serum", "plate_barcode", "dilution_factor"])
    .to_csv(frac_infectivity_csv, index=False, float_format="%.4g")
)

print(f"\nWriting fit parameters to {fits_csv}")
(
    fit_params_qc.drop(columns=["nreplicates", "ic50_str"]).to_csv(
        fits_csv, index=False, float_format="%.4g"
    )
)

print(f"\nPickling neutcurve.CurveFits object for these data to {fits_pickle}")
with open(fits_pickle, "wb") as f:
    pickle.dump(fits_qc, f)

print(f"\nWriting QC drops to {qc_drops_yaml}")


def tup_to_str(x):
    return " ".join(x) if isinstance(x, tuple) else x


qc_drops_for_yaml = {
    key: {tup_to_str(key2): val2 for key2, val2 in val.items()}
    for key, val in qc_drops.items()
}
with open(qc_drops_yaml, "w") as f:
    yaml.YAML(typ="rt").dump(qc_drops_for_yaml, f)
print("\nHere are the QC drops:\n***************************")
yaml.YAML(typ="rt").dump(qc_drops_for_yaml, sys.stdout)
Writing fraction infectivities to results/plates/H3N2_plate/frac_infectivity.csv

Writing fit parameters to results/plates/H3N2_plate/curvefits.csv

Pickling neutcurve.CurveFits object for these data to results/plates/H3N2_plate/curvefits.pickle

Writing QC drops to results/plates/H3N2_plate/qc_drops.yml

Here are the QC drops:
***************************
wells: {}
barcodes:
  AAAGTAGCAGAGGATT: min_neut_standard_frac_per_well
  AATGACAGCTGTCTAG: min_neut_standard_frac_per_well
  AATGGTCGAGCCATTC: min_neut_standard_frac_per_well
  ACGCAAATAGACCGAA: min_neut_standard_frac_per_well
  AGACCATCGCACCCAA: min_neut_standard_frac_per_well
  ATAGAAAATTATCCGC: min_neut_standard_frac_per_well
  CAAAAGCAGCACGATA: min_neut_standard_frac_per_well
  CAATTCGCCGTTCCCC: min_neut_standard_frac_per_well
  CCAATCCCAGCCTTTA: min_neut_standard_frac_per_well
  CCCGCTAACCCTGTCT: min_neut_standard_frac_per_well
  CCCTCCTCAAGGGTAA: min_neut_standard_frac_per_well
  CCGCAATGACAATTTG: min_neut_standard_frac_per_well
  CGTACGTATGTCCCAG: min_neut_standard_frac_per_well
  CGTCCCTGGCGTGTCG: min_neut_standard_frac_per_well
  CGTTAACGGCCTATCC: min_neut_standard_frac_per_well
  CTCATTACAGAAATTG: min_neut_standard_frac_per_well
  CTCCAATAGGAGACGA: min_neut_standard_frac_per_well
  CTTCATCTCATTTAAA: min_neut_standard_frac_per_well
  TATACTCACGGAGGAT: min_neut_standard_frac_per_well
  TATATGGAATACTAAA: min_neut_standard_frac_per_well
  TCCCCGTGGTTTGACA: min_neut_standard_frac_per_well
  TCCGCCACTATAACAT: min_neut_standard_frac_per_well
  TCTCCGATAGCCCTAC: min_neut_standard_frac_per_well
  TGACAAACACCTGAGG: min_neut_standard_frac_per_well
barcode_wells:
  CCTATAAGGCCTTACG A10: min_no_serum_count_per_viral_barcode_well
  GATTCACGGCCCACAA B10: min_no_serum_count_per_viral_barcode_well
  CACCGACCAACTCTCT E10: min_no_serum_count_per_viral_barcode_well
  GACCAAAGCTGCAGGG E10: min_no_serum_count_per_viral_barcode_well
barcode_serum_replicates:
  AAAGATAAATTCAAAA A23038d0_r32_75K: goodness_of_fit
  AAATGCTGAGAGGGTA A23038d0_r32_75K: goodness_of_fit
  AACGACACTTACATCC A23038d0_r32_75K: goodness_of_fit
  AATAGGCCCAAATCCA A23038d0_r32_75K: goodness_of_fit
  AATCTTTCCAATCTTG A23038d0_r32_75K: goodness_of_fit
  AATTACGCATAGGCCA A23038d0_r32_75K: goodness_of_fit
  ACAAAGATAAAAATTT A23038d0_r32_75K: goodness_of_fit
  AGATCATAAGCAATAA A23038d0_r32_75K: goodness_of_fit
  AGGGACTTTATTGTCC A23038d0_r32_75K: goodness_of_fit
  AGGTGCGAGCCATCAG A23038d0_r32_75K: goodness_of_fit
  ATCGAAAAAACTGCAA A23038d0_r32_75K: goodness_of_fit
  ATCGATTCGATTGACG A23038d0_r32_75K: goodness_of_fit
  ATGGTTATCTTACCTT A23038d0_r32_75K: goodness_of_fit
  ATTAGATTATAACGTA A23038d0_r32_75K: goodness_of_fit
  CACCGACCAACTCTCT A23038d0_r32_75K: goodness_of_fit
  CACGTTAGTGAGACTT A23038d0_r32_75K: goodness_of_fit
  CATAAAAGACTGTATA A23038d0_r32_75K: goodness_of_fit
  CATAATGCACAAACGC A23038d0_r32_75K: goodness_of_fit
  CCACAAGTTTGAAAAC A23038d0_r32_75K: goodness_of_fit
  CCATCACCTTATACAC A23038d0_r32_75K: goodness_of_fit
  CCGATAAGACGTCGCT A23038d0_r32_75K: goodness_of_fit
  CCTATAAGGCCTTACG A23038d0_r32_75K: goodness_of_fit
  CGCGAACAACAGGGGA A23038d0_r32_75K: goodness_of_fit
  CGTCAGAAGTTTATAA A23038d0_r32_75K: goodness_of_fit
  CTATCTTAATCTACAG A23038d0_r32_75K: goodness_of_fit
  CTGCGAATATTGTGAC A23038d0_r32_75K: goodness_of_fit
  CTTGAATACACAAACA A23038d0_r32_75K: goodness_of_fit
  GAAGTAACAAACTATG A23038d0_r32_75K: goodness_of_fit
  GACCAAAAAGCAGTAT A23038d0_r32_75K: goodness_of_fit
  GACCAAAGCTGCAGGG A23038d0_r32_75K: goodness_of_fit
  GATCTGCTTGGAATGT A23038d0_r32_75K: goodness_of_fit
  GCATTATAATCTTGTG A23038d0_r32_75K: goodness_of_fit
  GCTGGTGCACAAGATT A23038d0_r32_75K: goodness_of_fit
  GTTATTATGACTTCAT A23038d0_r32_75K: goodness_of_fit
  TACGAAAATCAAGAGC A23038d0_r32_75K: goodness_of_fit
  TATCAATTCGGTATTA A23038d0_r32_75K: goodness_of_fit
  TATCGCAATATGATAA A23038d0_r32_75K: goodness_of_fit
  TCAATGAATGCGGGGT A23038d0_r32_75K: goodness_of_fit
  TCCACACCCCTAGCTA A23038d0_r32_75K: goodness_of_fit
  TCTTAGAGTGAACGAT A23038d0_r32_75K: goodness_of_fit
  TGAGATCAGCCGGGTG A23038d0_r32_75K: goodness_of_fit
  TGCCGATCCAATTGAT A23038d0_r32_75K: goodness_of_fit
  TGTAGTATAAGAATAA A23038d0_r32_75K: goodness_of_fit
  TTAGCAGTTAACGTAT A23038d0_r32_75K: goodness_of_fit
  TTATGTTTTAATGGTA A23038d0_r32_75K: goodness_of_fit
  TTGACTCACCGAATAA A23038d0_r32_75K: goodness_of_fit
serum_replicates: {}