{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "82629a77", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import os\n", "from tqdm.notebook import tqdm\n", "from scipy.stats import combine_pvalues\n", "from statsmodels.stats.multitest import multipletests\n", "import numpy as np\n", "\n", "tqdm.pandas()" ] }, { "cell_type": "markdown", "id": "63a907ae", "metadata": {}, "source": [ "# ENCODE DNase I allelic imbalance data\n", "\n", "### If you have any questions, please reach us at:\n", "+ sabramov@altius.org Sergey Abramov \n", "+ sboytsov@altius.org Alexandr Boytsov\n" ] }, { "cell_type": "markdown", "id": "9942ab9e", "metadata": {}, "source": [ "# Accessing the data\n", "\n", "The data is stored at https://resources.altius.org/~jvierstra/projects/encode4-allelic-imbalance-v1
\n", "The directory contains:\n", "+ Metadata file: `metadata.tsv`\n", "+ All tested variants: `cav_pvalues.1010.melt.sorted.bed.gz`\n", "+ Readme file: `README.tsv`\n", "+ This notebook" ] }, { "cell_type": "markdown", "id": "153fe2ec", "metadata": {}, "source": [ "## Metadata file\n", "+ `ag_id` - a unique identifier of a sample\n", "+ `idniv_id` - individual genotype id. Samples with the same indiv_id have very similar genotype (not necessarily the same cell type)" ] }, { "cell_type": "code", "execution_count": 2, "id": "db332905", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2022-10-10 22:52:28-- https://resources.altius.org/~jvierstra/projects/encode4-allelic-imbalance-v1/metadata.tsv\n", "Resolving resources.altius.org (resources.altius.org)... 10.174.2.79\n", "Connecting to resources.altius.org (resources.altius.org)|10.174.2.79|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 227346 (222K) [text/tab-separated-values]\n", "Saving to: ‘metadata.tsv.1’\n", "\n", "100%[======================================>] 227,346 --.-K/s in 0.005s \n", "\n", "2022-10-10 22:52:28 (43.0 MB/s) - ‘metadata.tsv.1’ saved [227346/227346]\n", "\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ag_idindiv_idtaxonomy_nameontology_idontology_termsystemsubsystemorganorgan_regionside_position...donor_health_conditionvendor_nameperturbationtreatment_nametreatment_detailsdosedose_unitshotspot2_spotnuclear_percent_duplicationpaired_nuclear_align
070947INDIV_0007h.Macrophage.M2CL:0000862suppressor macrophageHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CentercontrolMediaRPMI-1640 MediaNaNNaN0.712719.5437242891954.0
170948INDIV_0010h.Macrophage.M2CL:0000862suppressor macrophageHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CenterbiologicLPSLipopolysaccharide (LPS)NaNNaN0.658614.7689187143892.0
270891INDIV_0029h.Macrophage.M1CL:0000863inflammatory macrophageHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CentercontrolDMSODMSONaNNaN0.617950.9084250837530.0
370883INDIV_0029h.CD14+CL:0001054CD14-positive monocyteHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CentercontrolDMSODMSONaNNaN0.277426.0185363144194.0
470847INDIV_0029h.Macrophage.M2CL:0000862suppressor macrophageHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CenterbiologicLPSLipopolysaccharide (LPS)NaNNaN0.687241.1007240800964.0
..................................................................
84370842INDIV_1830h.Macrophage.M2CL:0000862suppressor macrophageHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CentercontrolDMSODMSONaNNaN0.478326.4954422797678.0
84470832INDIV_1830h.Macrophage.M1CL:0000863inflammatory macrophageHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CentercontrolDMSODMSONaNNaN0.523570.2869286890510.0
84570806INDIV_1830h.Macrophage.M2CL:0000862suppressor macrophageHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CenterbiologicLPSLipopolysaccharide (LPS)NaNNaN0.492668.9087397025484.0
84670833INDIV_1830h.Macrophage.M1CL:0000863inflammatory macrophageHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CenterbiologicLPSLipopolysaccharide (LPS)NaNNaN0.379340.9481238711810.0
84770878INDIV_1830h.CD14+CL:0001054CD14-positive monocyteHematopoieticMyeloidBloodNaNNaN...NaNFred Hutch Cancer Research CentercontroluntreateduntreatedNaNNaN0.429529.3808203878976.0
\n", "

848 rows × 33 columns

\n", "
" ], "text/plain": [ " ag_id indiv_id taxonomy_name ontology_id ontology_term \\\n", "0 70947 INDIV_0007 h.Macrophage.M2 CL:0000862 suppressor macrophage \n", "1 70948 INDIV_0010 h.Macrophage.M2 CL:0000862 suppressor macrophage \n", "2 70891 INDIV_0029 h.Macrophage.M1 CL:0000863 inflammatory macrophage \n", "3 70883 INDIV_0029 h.CD14+ CL:0001054 CD14-positive monocyte \n", "4 70847 INDIV_0029 h.Macrophage.M2 CL:0000862 suppressor macrophage \n", ".. ... ... ... ... ... \n", "843 70842 INDIV_1830 h.Macrophage.M2 CL:0000862 suppressor macrophage \n", "844 70832 INDIV_1830 h.Macrophage.M1 CL:0000863 inflammatory macrophage \n", "845 70806 INDIV_1830 h.Macrophage.M2 CL:0000862 suppressor macrophage \n", "846 70833 INDIV_1830 h.Macrophage.M1 CL:0000863 inflammatory macrophage \n", "847 70878 INDIV_1830 h.CD14+ CL:0001054 CD14-positive monocyte \n", "\n", " system subsystem organ organ_region side_position ... \\\n", "0 Hematopoietic Myeloid Blood NaN NaN ... \n", "1 Hematopoietic Myeloid Blood NaN NaN ... \n", "2 Hematopoietic Myeloid Blood NaN NaN ... \n", "3 Hematopoietic Myeloid Blood NaN NaN ... \n", "4 Hematopoietic Myeloid Blood NaN NaN ... \n", ".. ... ... ... ... ... ... \n", "843 Hematopoietic Myeloid Blood NaN NaN ... \n", "844 Hematopoietic Myeloid Blood NaN NaN ... \n", "845 Hematopoietic Myeloid Blood NaN NaN ... \n", "846 Hematopoietic Myeloid Blood NaN NaN ... \n", "847 Hematopoietic Myeloid Blood NaN NaN ... \n", "\n", " donor_health_condition vendor_name perturbation \\\n", "0 NaN Fred Hutch Cancer Research Center control \n", "1 NaN Fred Hutch Cancer Research Center biologic \n", "2 NaN Fred Hutch Cancer Research Center control \n", "3 NaN Fred Hutch Cancer Research Center control \n", "4 NaN Fred Hutch Cancer Research Center biologic \n", ".. ... ... ... \n", "843 NaN Fred Hutch Cancer Research Center control \n", "844 NaN Fred Hutch Cancer Research Center control \n", "845 NaN Fred Hutch Cancer Research Center biologic \n", "846 NaN Fred Hutch Cancer Research Center biologic \n", "847 NaN Fred Hutch Cancer Research Center control \n", "\n", " treatment_name treatment_details dose dose_units hotspot2_spot \\\n", "0 Media RPMI-1640 Media NaN NaN 0.7127 \n", "1 LPS Lipopolysaccharide (LPS) NaN NaN 0.6586 \n", "2 DMSO DMSO NaN NaN 0.6179 \n", "3 DMSO DMSO NaN NaN 0.2774 \n", "4 LPS Lipopolysaccharide (LPS) NaN NaN 0.6872 \n", ".. ... ... ... ... ... \n", "843 DMSO DMSO NaN NaN 0.4783 \n", "844 DMSO DMSO NaN NaN 0.5235 \n", "845 LPS Lipopolysaccharide (LPS) NaN NaN 0.4926 \n", "846 LPS Lipopolysaccharide (LPS) NaN NaN 0.3793 \n", "847 untreated untreated NaN NaN 0.4295 \n", "\n", " nuclear_percent_duplication paired_nuclear_align \n", "0 19.5437 242891954.0 \n", "1 14.7689 187143892.0 \n", "2 50.9084 250837530.0 \n", "3 26.0185 363144194.0 \n", "4 41.1007 240800964.0 \n", ".. ... ... \n", "843 26.4954 422797678.0 \n", "844 70.2869 286890510.0 \n", "845 68.9087 397025484.0 \n", "846 40.9481 238711810.0 \n", "847 29.3808 203878976.0 \n", "\n", "[848 rows x 33 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "!wget https://resources.altius.org/~jvierstra/projects/encode4-allelic-imbalance-v1/metadata.tsv\n", "meta = pd.read_table('metadata.tsv')\n", "meta" ] }, { "cell_type": "markdown", "id": "724b814e", "metadata": {}, "source": [ "## Variants format\n", "### Variants are stored in the bed-like format:\n", "+ `#chr, start, end`: genomic position of the SNV, hg38 genome assembly;\n", "+ `ID`: rsSNP ID of the SNV according to the dbSNP build 151;\n", "+ `ref`: reference allele (A,C,G, or T, according to hg38);\n", "+ `alt`: alternative allele;\n", "+ `ref_counts, alt_counts`: number of reads mapped to the alleles;\n", "+ `sample_id`: unique identifier of the sample, corresponds to `ag_id` in metadata file\n", "+ `BAD`: background allelic dosage estimation at the variant. Higher BAD values correspond to the higher contribution of aneuploidy and local copy-number variants. BAD scores serve as a baseline when estimating the statistical significance and the effect size of each tested variant. BAD=1 in case of diploid and BAD=2 in case of triploid.\n", "\n", "+ `es`: Effect size of allelic imbalance at the variant, log2. Positive values indicate imbalance favouring the reference allele. Corrected for BAD.\n", "+ `pval_ref, pval_alt`: allele-wise P-value estimations of one-sided tests. Lower P-values indicate the preference of the corresponding allele, e.g. low `pval_ref` means the variant is reference imbalanced\n" ] }, { "cell_type": "markdown", "id": "c4659153", "metadata": {}, "source": [ "### You can download the variants in a specific region using tabix\n", "`tabix https://resources.altius.org/~jvierstra/projects/encode4-allelic-imbalance-v1/cav_pvalues.1010.melt.sorted.bed.gz ${region} > cav_pvalues_${region}.bed`\n" ] }, { "cell_type": "code", "execution_count": 3, "id": "58e4201e", "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#chrstartendIDrefaltref_countsalt_countssample_idBADespval_refpval_alt
0chr11010810109rs376007522AT1439723891.0-1.4780470.9998660.000401
1chr11010810109rs376007522AT621723861.0-1.8073550.9993980.002938
2chr11046810469rs370233998CG106727371.50.3449800.4325310.837543
3chr11058810589.CT106727552.5-0.5189690.7852260.603204
4chr11058810589.CT55727562.50.0000001.0000001.000000
..........................................
39253chr149780324978033rs6666618AC810799661.5-0.0969430.6296980.558183
39254chr149785634978564rs12564764GA56799941.5-0.1460421.0000000.671661
39255chr149849094984910rs1106585GA1311800311.00.2410080.4196530.729898
39256chr149849694984970rs1106586GA1211800301.50.0085380.5702810.544987
39257chr149849694984970rs1106586GA75800311.00.4854270.4285711.000000
\n", "

39258 rows × 13 columns

\n", "
" ], "text/plain": [ " #chr start end ID ref alt ref_counts alt_counts \\\n", "0 chr1 10108 10109 rs376007522 A T 14 39 \n", "1 chr1 10108 10109 rs376007522 A T 6 21 \n", "2 chr1 10468 10469 rs370233998 C G 10 6 \n", "3 chr1 10588 10589 . C T 10 6 \n", "4 chr1 10588 10589 . C T 5 5 \n", "... ... ... ... ... .. .. ... ... \n", "39253 chr1 4978032 4978033 rs6666618 A C 8 10 \n", "39254 chr1 4978563 4978564 rs12564764 G A 5 6 \n", "39255 chr1 4984909 4984910 rs1106585 G A 13 11 \n", "39256 chr1 4984969 4984970 rs1106586 G A 12 11 \n", "39257 chr1 4984969 4984970 rs1106586 G A 7 5 \n", "\n", " sample_id BAD es pval_ref pval_alt \n", "0 72389 1.0 -1.478047 0.999866 0.000401 \n", "1 72386 1.0 -1.807355 0.999398 0.002938 \n", "2 72737 1.5 0.344980 0.432531 0.837543 \n", "3 72755 2.5 -0.518969 0.785226 0.603204 \n", "4 72756 2.5 0.000000 1.000000 1.000000 \n", "... ... ... ... ... ... \n", "39253 79966 1.5 -0.096943 0.629698 0.558183 \n", "39254 79994 1.5 -0.146042 1.000000 0.671661 \n", "39255 80031 1.0 0.241008 0.419653 0.729898 \n", "39256 80030 1.5 0.008538 0.570281 0.544987 \n", "39257 80031 1.0 0.485427 0.428571 1.000000 \n", "\n", "[39258 rows x 13 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "columns = ['#chr', 'start', 'end', 'ID', 'ref', 'alt', 'ref_counts', 'alt_counts',\n", " 'sample_id', 'BAD', 'es', 'pval_ref', 'pval_alt']\n", "\n", "!tabix https://resources.altius.org/~jvierstra/projects/encode4-allelic-imbalance-v1/cav_pvalues.1010.melt.sorted.bed.gz chr1:1-5000000 > cav_pvalues.chr1_1-5000000.bed\n", "\n", " \n", "tested_variants = pd.read_table('cav_pvalues.chr1_1-5000000.bed', header=None, names=columns)\n", "tested_variants" ] }, { "cell_type": "markdown", "id": "b752e73f", "metadata": {}, "source": [ "### `sample_id` is a unique identifier of the dataset; it corresponds to `ag_id` in metadata file" ] }, { "cell_type": "code", "execution_count": 4, "id": "2568255f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#chrstartendIDrefaltref_countsalt_countssample_idBAD...donor_health_conditionvendor_nameperturbationtreatment_nametreatment_detailsdosedose_unitshotspot2_spotnuclear_percent_duplicationpaired_nuclear_align
0chr11010810109rs376007522AT1439723891.0...NaNStemExpresscytokineIL-12IL-12NaNNaN0.229723.5324278459478.0
1chr1183628183629rs71267774GA179723891.0...NaNStemExpresscytokineIL-12IL-12NaNNaN0.229723.5324278459478.0
2chr1605594605595rs80246094GA238723891.0...NaNStemExpresscytokineIL-12IL-12NaNNaN0.229723.5324278459478.0
3chr1941118941119rs4372192AG1821723891.0...NaNStemExpresscytokineIL-12IL-12NaNNaN0.229723.5324278459478.0
4chr1941766941767rs114982608GA1317723891.0...NaNStemExpresscytokineIL-12IL-12NaNNaN0.229723.5324278459478.0
..................................................................
39253chr138572503857251rs12126653CG610802761.0...NaNNaNnoneuntreateduntreatedNaNNaN0.18332.455228127002.0
39254chr139006953900696rs10909825CT1011802761.0...NaNNaNnoneuntreateduntreatedNaNNaN0.18332.455228127002.0
39255chr139010673901068rs3935779GT1210802761.0...NaNNaNnoneuntreateduntreatedNaNNaN0.18332.455228127002.0
39256chr139010963901097rs3935778CT2113802761.0...NaNNaNnoneuntreateduntreatedNaNNaN0.18332.455228127002.0
39257chr139015343901535rs72849340TG56802761.0...NaNNaNnoneuntreateduntreatedNaNNaN0.18332.455228127002.0
\n", "

39258 rows × 46 columns

\n", "
" ], "text/plain": [ " #chr start end ID ref alt ref_counts alt_counts \\\n", "0 chr1 10108 10109 rs376007522 A T 14 39 \n", "1 chr1 183628 183629 rs71267774 G A 17 9 \n", "2 chr1 605594 605595 rs80246094 G A 23 8 \n", "3 chr1 941118 941119 rs4372192 A G 18 21 \n", "4 chr1 941766 941767 rs114982608 G A 13 17 \n", "... ... ... ... ... .. .. ... ... \n", "39253 chr1 3857250 3857251 rs12126653 C G 6 10 \n", "39254 chr1 3900695 3900696 rs10909825 C T 10 11 \n", "39255 chr1 3901067 3901068 rs3935779 G T 12 10 \n", "39256 chr1 3901096 3901097 rs3935778 C T 21 13 \n", "39257 chr1 3901534 3901535 rs72849340 T G 5 6 \n", "\n", " sample_id BAD ... donor_health_condition vendor_name perturbation \\\n", "0 72389 1.0 ... NaN StemExpress cytokine \n", "1 72389 1.0 ... NaN StemExpress cytokine \n", "2 72389 1.0 ... NaN StemExpress cytokine \n", "3 72389 1.0 ... NaN StemExpress cytokine \n", "4 72389 1.0 ... NaN StemExpress cytokine \n", "... ... ... ... ... ... ... \n", "39253 80276 1.0 ... NaN NaN none \n", "39254 80276 1.0 ... NaN NaN none \n", "39255 80276 1.0 ... NaN NaN none \n", "39256 80276 1.0 ... NaN NaN none \n", "39257 80276 1.0 ... NaN NaN none \n", "\n", " treatment_name treatment_details dose dose_units hotspot2_spot \\\n", "0 IL-12 IL-12 NaN NaN 0.2297 \n", "1 IL-12 IL-12 NaN NaN 0.2297 \n", "2 IL-12 IL-12 NaN NaN 0.2297 \n", "3 IL-12 IL-12 NaN NaN 0.2297 \n", "4 IL-12 IL-12 NaN NaN 0.2297 \n", "... ... ... ... ... ... \n", "39253 untreated untreated NaN NaN 0.1833 \n", "39254 untreated untreated NaN NaN 0.1833 \n", "39255 untreated untreated NaN NaN 0.1833 \n", "39256 untreated untreated NaN NaN 0.1833 \n", "39257 untreated untreated NaN NaN 0.1833 \n", "\n", " nuclear_percent_duplication paired_nuclear_align \n", "0 23.5324 278459478.0 \n", "1 23.5324 278459478.0 \n", "2 23.5324 278459478.0 \n", "3 23.5324 278459478.0 \n", "4 23.5324 278459478.0 \n", "... ... ... \n", "39253 2.4552 28127002.0 \n", "39254 2.4552 28127002.0 \n", "39255 2.4552 28127002.0 \n", "39256 2.4552 28127002.0 \n", "39257 2.4552 28127002.0 \n", "\n", "[39258 rows x 46 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tested_variants.merge(meta, left_on='sample_id', right_on='ag_id')" ] }, { "cell_type": "markdown", "id": "3f4daa8c", "metadata": {}, "source": [ "### Transforming from 'melt' format to pivot table\n", "Conversion from 'long' to 'wide' format, with 7 * number_of_samples columns" ] }, { "cell_type": "code", "execution_count": 5, "id": "ef04330e", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
BADalt_counts...pval_refref_counts
sample_id72386723897273772755727567238672389727377275572756...72386723897273772755727567238672389727377275572756
#chrstartendIDrefalt
chr11010810109rs376007522AT1.01.0NaNNaNNaN21.039.0NaNNaNNaN...0.9993980.999866NaNNaNNaN6.014.0NaNNaNNaN
1046810469rs370233998CGNaNNaN1.5NaNNaNNaNNaN6.0NaNNaN...NaNNaN0.432531NaNNaNNaNNaN10.0NaNNaN
1058810589.CTNaNNaNNaN2.52.5NaNNaNNaN6.05.0...NaNNaNNaN0.7852261.0NaNNaNNaN10.05.0
\n", "

3 rows × 30 columns

\n", "
" ], "text/plain": [ " BAD alt_counts \\\n", "sample_id 72386 72389 72737 72755 72756 72386 \n", "#chr start end ID ref alt \n", "chr1 10108 10109 rs376007522 A T 1.0 1.0 NaN NaN NaN 21.0 \n", " 10468 10469 rs370233998 C G NaN NaN 1.5 NaN NaN NaN \n", " 10588 10589 . C T NaN NaN NaN 2.5 2.5 NaN \n", "\n", " ... pval_ref \\\n", "sample_id 72389 72737 72755 72756 ... 72386 \n", "#chr start end ID ref alt ... \n", "chr1 10108 10109 rs376007522 A T 39.0 NaN NaN NaN ... 0.999398 \n", " 10468 10469 rs370233998 C G NaN 6.0 NaN NaN ... NaN \n", " 10588 10589 . C T NaN NaN 6.0 5.0 ... NaN \n", "\n", " \\\n", "sample_id 72389 72737 72755 72756 \n", "#chr start end ID ref alt \n", "chr1 10108 10109 rs376007522 A T 0.999866 NaN NaN NaN \n", " 10468 10469 rs370233998 C G NaN 0.432531 NaN NaN \n", " 10588 10589 . C T NaN NaN 0.785226 1.0 \n", "\n", " ref_counts \n", "sample_id 72386 72389 72737 72755 72756 \n", "#chr start end ID ref alt \n", "chr1 10108 10109 rs376007522 A T 6.0 14.0 NaN NaN NaN \n", " 10468 10469 rs370233998 C G NaN NaN 10.0 NaN NaN \n", " 10588 10589 . C T NaN NaN NaN 10.0 5.0 \n", "\n", "[3 rows x 30 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Remove head to get melt table on all the data\n", "test = tested_variants.head()\n", "ptable = pd.pivot_table(test, index=['#chr', 'start', 'end', 'ID', 'ref', 'alt'], columns=['sample_id'])\n", "ptable" ] }, { "cell_type": "markdown", "id": "2a66d7bb", "metadata": {}, "source": [ "# Data aggregation\n", "\n", "To increase statistical power data can be aggregated in various ways, see examples below" ] }, { "cell_type": "code", "execution_count": 48, "id": "66dda36e", "metadata": {}, "outputs": [], "source": [ "## Functions to aggregate the data and calculate FDR separetely for ref and alt alleles\n", "alleles = ('ref', 'alt')\n", "group_columns = ['#chr', 'start', 'end', 'ID', 'ref', 'alt']\n", "aggregation_method = 'mudholkar_george' # for the list of available methods see https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.combine_pvalues.html\n", "\n", "def aggregate_snp(snp_df, *args, **kwargs):\n", " pvals = {}\n", " effect_sizes = {}\n", " for allele in alleles:\n", " raw_pvals = snp_df[f\"pval_{allele}\"].to_numpy()\n", " pvals[allele] = logit_aggregate_pvalues(raw_pvals)\n", " # Effect size alt = -1 * Effect size ref\n", " effect_sizes[allele] = aggregate_es(snp_df['es'].tolist(), raw_pvals)\n", " mean_BAD = snp_df['BAD'].mean()\n", " return mean_BAD, pvals, effect_sizes\n", "\n", "def aggregate_apply(df):\n", " new_df = df.drop_duplicates(subset=group_columns).copy()\n", " mean_BAD, pvals, effect_sizes = aggregate_snp(df)\n", " new_df['mean_BAD'] = mean_BAD\n", " for allele in alleles:\n", " new_df[f'pval_ag_{allele}'] = pvals[allele]\n", " \n", " if pvals['ref'] < pvals['alt']:\n", " preference = 'ref'\n", " elif pvals['ref'] > pvals['alt']:\n", " preference = 'alt'\n", " else:\n", " preference = None\n", " if preference is None:\n", " if pd.isna(effect_sizes['ref']) and pd.isna(effect_sizes['alt']):\n", " new_df['es_ag'] = None\n", " elif effect_sizes['ref'] == effect_sizes['alt']:\n", " new_df['es_ag'] = 0\n", " else:\n", " new_df['es_ag'] = effect_sizes['ref']\n", " else:\n", " new_df['es_ag'] = effect_sizes[preference]\n", "# new_df['preference'] = preference\n", " new_df['# of SNPs'] = len(df.index)\n", " new_df['max_coverage'] = df.eval('ref_counts + alt_counts').max()\n", " return new_df[group_columns + ['mean_BAD', '# of SNPs', 'max_coverage', 'es_ag', 'pval_ag_ref', 'pval_ag_alt']]\n", "\n", "\n", "def aggregate_es(es_array, p_array):\n", " res = [(x, y) for x, y in zip(es_array, p_array) if y != 1 and not pd.isna(y)]\n", " if len(res) > 0:\n", " es, p = zip(*res)\n", " weights = [-1 * np.log10(x) for x in p]\n", " es_mean = np.average(es, weights=weights)\n", " else:\n", " es_mean = np.nan\n", " return es_mean\n", "\n", "\n", "def logit_aggregate_pvalues(pval_list):\n", " pvalues = np.array([pvalue for pvalue in pval_list if 1 > pvalue > 0])\n", " if len(pvalues) == 0:\n", " return 1\n", " elif len(pvalues) == 1:\n", " return pvalues[0]\n", " return combine_pvalues(pvalues, method=aggregation_method)[1]\n", "\n", "def calc_fdr(aggr_df):\n", " for allele in alleles:\n", " try:\n", " _, pval_arr, _, _ = multipletests(aggr_df[f'pval_ag_{allele}'], alpha=0.05, method='fdr_bh')\n", " except TypeError:\n", " print(aggr_df, aggr_df[f'pval_ag_{allele}'])\n", " raise\n", " aggr_df[f\"fdrp_bh_{allele}\"] = pval_arr\n", " aggr_df['min_fdrp_bh'] = aggr_df[[\"fdrp_bh_ref\", \"fdrp_bh_alt\"]].min(axis=1)\n", " return aggr_df\n", "\n", "def aggregate_and_fdr(df, group_key=None, with_ray=False, show_bar=False, *args, **kwargs):\n", " if with_ray:\n", " # Parallel code, often works slower than pure pandas\n", " import swifter\n", " return df.swifter.groupby(group_columns, as_index=False, group_keys=False).apply(aggregate_apply)\n", " else:\n", " snp_groups = df.groupby(group_columns, as_index=False, group_keys=False)\n", " if show_bar:\n", " iterator = tqdm(snp_groups)\n", " else:\n", " iterator = snp_groups\n", " result_df = calc_fdr(pd.concat([aggregate_apply(snp_df) for snp_key, snp_df in iterator]))\n", " result_df['group_key'] = group_key\n", " return result_df\n", "\n", "def aggregate_over_column(df, group_column=None):\n", " if group_column is None:\n", " # Aggregate over all the data\n", " return aggregate_and_fdr(df, show_bar=True)\n", " else:\n", " # Groupby over specified column and calculate FDR separetely for each group\n", " return pd.concat([aggregate_and_fdr(g_df, g_name) for g_name, g_df in tqdm(df.groupby(group_column, as_index=False))])" ] }, { "cell_type": "markdown", "id": "fcd61c50", "metadata": {}, "source": [ "## Data can be grouped by various properties and then aggregated within each group.\n", "### For example, we can group by indiv_id (samples with the same genotype), taxonomy_name (i.e. cell type), e.t.c\n", "### Within each group variants at the same genomic position and with the same genotype (ref and alt alleles) are aggregated\n", "### Due to unknown structural haplotypes at each variant, we perform a one-sided test for reference and alternative allele imbalance separately. This yields two significance estimates: one for each of the alleles.\n", "### After aggregation the data has the following format:\n", "+ `#chr, start, end`: genomic position of the SNV, hg38 genome assembly;
\n", "+ `ID`: rsSNP ID of the SNV according to the dbSNP build 151;
\n", "+ `ref`: reference allele (A,C,G, or T, according to hg38);
\n", "+ `alt`: alternative allele\n", "+ `mean_BAD`: average BAD score of the aggregated variants at the same genomic position\n", "+ `# of SNPs`: number of SNVs on the same position being aggregated\n", "+ `max_coverage`: maximal coverage among aggregated variants at the same genomic position\n", "+ `es_ag`: aggregated effect sizes. Calculated as an average of individual effect sizes at the aggregated variants with weights being log10(P-values). N/A if all P-values in the aggregation are 1.\n", "+ `pval_ag_ref, pval_ag_alt`: aggregated pvalues. By default we are using logit (Mudholkar-George) aggregation of pvalues.\n", "+ `fdrp_bh_ref, fdrp_bh_alt`: Benjamini-Hochberg FDR corrected aggregated p-values for tested SNVs from the datasets in the same group\n", "+ `min_fdrp_bh`: minimum of `fdrp_bh_ref` and `fdrp_bh_alt`\n", "+ `group_key`: name of the aggregation group, e.g. cell type\n", "\n" ] }, { "cell_type": "markdown", "id": "72447ea7", "metadata": {}, "source": [ "## Examples of data aggregation" ] }, { "cell_type": "markdown", "id": "2a24bd1e", "metadata": {}, "source": [ "### Pull the data from the server" ] }, { "cell_type": "code", "execution_count": 13, "id": "52bfa9d1", "metadata": {}, "outputs": [], "source": [ "columns = ['#chr', 'start', 'end', 'ID', 'ref', 'alt', 'ref_counts', 'alt_counts',\n", " 'sample_id', 'BAD', 'es', 'pval_ref', 'pval_alt']\n", "\n", "# get variants in chr1:1-5000000\n", "!tabix https://resources.altius.org/~jvierstra/projects/encode4-allelic-imbalance-v1/cav_pvalues.1010.melt.sorted.bed.gz chr1:1-5000000 > cav_pvalues.chr1_1-5000000.bed\n", " \n", "tested_variants = pd.read_table('cav_pvalues.chr1_1-5000000.bed', header=None, names=columns)\n", "\n", "# filter by coverage >= 20\n", "tested_variants = tested_variants[tested_variants.eval('ref_counts + alt_counts >= 20')]" ] }, { "cell_type": "markdown", "id": "dd01f268", "metadata": {}, "source": [ "### Aggregate all the data" ] }, { "cell_type": "code", "execution_count": 49, "id": "86125be7", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "f1056f48111347b3b332b93fdfbe1135", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/3731 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#chrstartendIDrefaltmean_BAD# of SNPsmax_coveragees_agpval_ag_refpval_ag_altfdrp_bh_reffdrp_bh_altmin_fdrp_bhgroup_key
0chr11010810109rs376007522AT1.000000253-1.6186840.9999962.526052e-051.0000000.0026930.002693None
10chr1102950102951rs200298631CT1.0000001351.3219280.0083379.970077e-010.1346511.0000000.134651None
18chr1181112181113rs1383295402AG1.000000128-2.2016341.0000004.424427e-041.0000000.0250110.025011None
19chr1181320181321rs1191715520CG1.66666761041.2951870.1813749.634122e-010.8986791.0000000.898679None
30chr1181383181384rs1258951683GC1.5000001311.4744590.0123639.970303e-010.1780991.0000000.178099None
...................................................
39243chr149777514977752rs6666330AG1.500000141-0.5246180.8947711.776924e-011.0000001.0000001.000000None
39247chr149779904977991rs12739724GA1.5000003148-1.1311600.9999722.155024e-041.0000000.0154620.015462None
39250chr149780324978033rs6666618AC1.5000003162-1.1744311.0000007.905153e-081.0000000.0000140.000014None
39255chr149849094984910rs1106585GA1.0000001240.2410080.4196537.298979e-011.0000001.0000001.000000None
39256chr149849694984970rs1106586GA1.5000001230.0085380.5702815.449869e-011.0000001.0000001.000000None
\n", "

3731 rows × 16 columns

\n", "" ], "text/plain": [ " #chr start end ID ref alt mean_BAD # of SNPs \\\n", "0 chr1 10108 10109 rs376007522 A T 1.000000 2 \n", "10 chr1 102950 102951 rs200298631 C T 1.000000 1 \n", "18 chr1 181112 181113 rs1383295402 A G 1.000000 1 \n", "19 chr1 181320 181321 rs1191715520 C G 1.666667 6 \n", "30 chr1 181383 181384 rs1258951683 G C 1.500000 1 \n", "... ... ... ... ... .. .. ... ... \n", "39243 chr1 4977751 4977752 rs6666330 A G 1.500000 1 \n", "39247 chr1 4977990 4977991 rs12739724 G A 1.500000 3 \n", "39250 chr1 4978032 4978033 rs6666618 A C 1.500000 3 \n", "39255 chr1 4984909 4984910 rs1106585 G A 1.000000 1 \n", "39256 chr1 4984969 4984970 rs1106586 G A 1.500000 1 \n", "\n", " max_coverage es_ag pval_ag_ref pval_ag_alt fdrp_bh_ref \\\n", "0 53 -1.618684 0.999996 2.526052e-05 1.000000 \n", "10 35 1.321928 0.008337 9.970077e-01 0.134651 \n", "18 28 -2.201634 1.000000 4.424427e-04 1.000000 \n", "19 104 1.295187 0.181374 9.634122e-01 0.898679 \n", "30 31 1.474459 0.012363 9.970303e-01 0.178099 \n", "... ... ... ... ... ... \n", "39243 41 -0.524618 0.894771 1.776924e-01 1.000000 \n", "39247 148 -1.131160 0.999972 2.155024e-04 1.000000 \n", "39250 162 -1.174431 1.000000 7.905153e-08 1.000000 \n", "39255 24 0.241008 0.419653 7.298979e-01 1.000000 \n", "39256 23 0.008538 0.570281 5.449869e-01 1.000000 \n", "\n", " fdrp_bh_alt min_fdrp_bh group_key \n", "0 0.002693 0.002693 None \n", "10 1.000000 0.134651 None \n", "18 0.025011 0.025011 None \n", "19 1.000000 0.898679 None \n", "30 1.000000 0.178099 None \n", "... ... ... ... \n", "39243 1.000000 1.000000 None \n", "39247 0.015462 0.015462 None \n", "39250 0.000014 0.000014 None \n", "39255 1.000000 1.000000 None \n", "39256 1.000000 1.000000 None \n", "\n", "[3731 rows x 16 columns]" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "aggregated_data = aggregate_over_column(tested_variants, None)\n", "aggregated_data" ] }, { "cell_type": "markdown", "id": "1f386023", "metadata": {}, "source": [ "### Aggregate data within each sample independently" ] }, { "cell_type": "code", "execution_count": 50, "id": "7081782c", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "eddf82a279dc45cf8fe7117106c7a541", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/821 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#chrstartendIDrefaltmean_BAD# of SNPsmax_coveragees_agpval_ag_refpval_ag_altfdrp_bh_reffdrp_bh_altmin_fdrp_bhgroup_key
8655chr113722571372258rs2649597TC1.01211.0000000.0942880.9642670.9428830.9642670.94288358668
8829chr113752871375288rs2242398AC1.0158-0.0995360.6529980.4478420.9894340.8956830.89568358668
8942chr113755431375544rs2765033TC1.0184-0.2756340.8369330.2226020.9894340.5565060.55650658668
8981chr113755941375595rs2649594CT1.0136-0.4854270.8785080.2025160.9894340.5565060.55650658668
18700chr122321282232129rs263533CT1.01200.0000000.5910640.5910640.9894340.9195640.91956458668
...................................................
18795chr122321282232129rs263533CT1.0140-0.2895070.7852050.3179140.9854260.6358280.63582880833
19418chr122556452255646rs260508TG1.0196-0.2410080.8208010.2375760.9854260.5543440.55434480833
23036chr125470712547072rs2477678TA1.011290.0671140.4301550.6375730.9854260.9323070.93230780833
23830chr125784282578429rs140567883CG1.01760.4594320.1033680.9323070.9100490.9323070.91004980833
33385chr138571683857169rs13374773CA1.01260.2223920.4225970.7215820.9854260.9323070.93230780833
\n", "

24384 rows × 16 columns

\n", "" ], "text/plain": [ " #chr start end ID ref alt mean_BAD # of SNPs \\\n", "8655 chr1 1372257 1372258 rs2649597 T C 1.0 1 \n", "8829 chr1 1375287 1375288 rs2242398 A C 1.0 1 \n", "8942 chr1 1375543 1375544 rs2765033 T C 1.0 1 \n", "8981 chr1 1375594 1375595 rs2649594 C T 1.0 1 \n", "18700 chr1 2232128 2232129 rs263533 C T 1.0 1 \n", "... ... ... ... ... .. .. ... ... \n", "18795 chr1 2232128 2232129 rs263533 C T 1.0 1 \n", "19418 chr1 2255645 2255646 rs260508 T G 1.0 1 \n", "23036 chr1 2547071 2547072 rs2477678 T A 1.0 1 \n", "23830 chr1 2578428 2578429 rs140567883 C G 1.0 1 \n", "33385 chr1 3857168 3857169 rs13374773 C A 1.0 1 \n", "\n", " max_coverage es_ag pval_ag_ref pval_ag_alt fdrp_bh_ref \\\n", "8655 21 1.000000 0.094288 0.964267 0.942883 \n", "8829 58 -0.099536 0.652998 0.447842 0.989434 \n", "8942 84 -0.275634 0.836933 0.222602 0.989434 \n", "8981 36 -0.485427 0.878508 0.202516 0.989434 \n", "18700 20 0.000000 0.591064 0.591064 0.989434 \n", "... ... ... ... ... ... \n", "18795 40 -0.289507 0.785205 0.317914 0.985426 \n", "19418 96 -0.241008 0.820801 0.237576 0.985426 \n", "23036 129 0.067114 0.430155 0.637573 0.985426 \n", "23830 76 0.459432 0.103368 0.932307 0.910049 \n", "33385 26 0.222392 0.422597 0.721582 0.985426 \n", "\n", " fdrp_bh_alt min_fdrp_bh group_key \n", "8655 0.964267 0.942883 58668 \n", "8829 0.895683 0.895683 58668 \n", "8942 0.556506 0.556506 58668 \n", "8981 0.556506 0.556506 58668 \n", "18700 0.919564 0.919564 58668 \n", "... ... ... ... \n", "18795 0.635828 0.635828 80833 \n", "19418 0.554344 0.554344 80833 \n", "23036 0.932307 0.932307 80833 \n", "23830 0.932307 0.910049 80833 \n", "33385 0.932307 0.932307 80833 \n", "\n", "[24384 rows x 16 columns]" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "aggregated_data = aggregate_over_column(tested_variants, 'sample_id')\n", "aggregated_data" ] }, { "cell_type": "markdown", "id": "8b25ac6e", "metadata": {}, "source": [ "### Aggregate data within each cell type" ] }, { "cell_type": "code", "execution_count": 51, "id": "3b3561b9", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ea489cec7f6e41ecbeb6352a3961173c", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/179 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#chrstartendIDrefaltmean_BAD# of SNPsmax_coveragees_agpval_ag_refpval_ag_altfdrp_bh_reffdrp_bh_altmin_fdrp_bhgroup_key
2522chr1184460184461rs1169715936GA1.01231.8479970.0050751.0000000.0913251.0000000.091325HMEC
2523chr1778533778534rs570631764AG1.01300.0000000.5722470.5722470.8865961.0000000.886596HMEC
2524chr1778638778639rs114983708AG1.01331.4150370.0067650.9977300.0913251.0000000.091325HMEC
2525chr1827251827252rs3131948TA1.01201.5849630.0195471.0000000.1319421.0000000.131942HMEC
2526chr1827434827435rs569781818GA1.01400.8930850.0403450.9807610.2178641.0000000.217864HMEC
...................................................
21226chr138575733857574rs12723005TC1.5154-0.1686140.7279810.3528760.8814270.9020080.881427iTH2
21227chr138577313857732rs12722891AG1.5135-0.9467380.9742830.0574320.9996490.7466210.746621iTH2
21228chr138577473857748rs12726906TC1.5127-1.2250580.9890940.0409770.9996490.7102740.710274iTH2
21229chr142675664267567rs115393370GA1.51490.7372000.0664060.9647490.5755160.9967950.575516iTH2
21230chr143797564379757rs2926482CG1.51400.0000000.5277140.5277140.8336660.9020080.833666iTH2
\n", "

14450 rows × 16 columns

\n", "" ], "text/plain": [ " #chr start end ID ref alt mean_BAD # of SNPs \\\n", "2522 chr1 184460 184461 rs1169715936 G A 1.0 1 \n", "2523 chr1 778533 778534 rs570631764 A G 1.0 1 \n", "2524 chr1 778638 778639 rs114983708 A G 1.0 1 \n", "2525 chr1 827251 827252 rs3131948 T A 1.0 1 \n", "2526 chr1 827434 827435 rs569781818 G A 1.0 1 \n", "... ... ... ... ... .. .. ... ... \n", "21226 chr1 3857573 3857574 rs12723005 T C 1.5 1 \n", "21227 chr1 3857731 3857732 rs12722891 A G 1.5 1 \n", "21228 chr1 3857747 3857748 rs12726906 T C 1.5 1 \n", "21229 chr1 4267566 4267567 rs115393370 G A 1.5 1 \n", "21230 chr1 4379756 4379757 rs2926482 C G 1.5 1 \n", "\n", " max_coverage es_ag pval_ag_ref pval_ag_alt fdrp_bh_ref \\\n", "2522 23 1.847997 0.005075 1.000000 0.091325 \n", "2523 30 0.000000 0.572247 0.572247 0.886596 \n", "2524 33 1.415037 0.006765 0.997730 0.091325 \n", "2525 20 1.584963 0.019547 1.000000 0.131942 \n", "2526 40 0.893085 0.040345 0.980761 0.217864 \n", "... ... ... ... ... ... \n", "21226 54 -0.168614 0.727981 0.352876 0.881427 \n", "21227 35 -0.946738 0.974283 0.057432 0.999649 \n", "21228 27 -1.225058 0.989094 0.040977 0.999649 \n", "21229 49 0.737200 0.066406 0.964749 0.575516 \n", "21230 40 0.000000 0.527714 0.527714 0.833666 \n", "\n", " fdrp_bh_alt min_fdrp_bh group_key \n", "2522 1.000000 0.091325 HMEC \n", "2523 1.000000 0.886596 HMEC \n", "2524 1.000000 0.091325 HMEC \n", "2525 1.000000 0.131942 HMEC \n", "2526 1.000000 0.217864 HMEC \n", "... ... ... ... \n", "21226 0.902008 0.881427 iTH2 \n", "21227 0.746621 0.746621 iTH2 \n", "21228 0.710274 0.710274 iTH2 \n", "21229 0.996795 0.575516 iTH2 \n", "21230 0.902008 0.833666 iTH2 \n", "\n", "[14450 rows x 16 columns]" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tested_variants_and_metadata = tested_variants.merge(meta, left_on='sample_id', right_on='ag_id')\n", "aggregated_data = aggregate_over_column(tested_variants_and_metadata, 'taxonomy_name')\n", "aggregated_data" ] }, { "cell_type": "code", "execution_count": 55, "id": "f321971f", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
#chrstartendIDrefaltmean_BAD# of SNPsmax_coveragees_agpval_ag_refpval_ag_altfdrp_bh_reffdrp_bh_altmin_fdrp_bhgroup_key
20120chr116591131659114rs7534581AG1.51297-1.2506511.0000002.629178e-111.0000008.150450e-108.150450e-10HMVEC-LBl
20121chr116592081659209rs150671568CA1.01435-1.0450101.0000001.864821e-131.0000001.156189e-111.156189e-11HMVEC-LBl
20122chr116592191659220rs9661500AG1.01332-1.0130561.0000003.886425e-101.0000008.031945e-098.031945e-09HMVEC-LBl
20125chr119197451919746rs113425481GT1.5136-2.0473371.0000006.320732e-041.0000006.531423e-036.531423e-03HMVEC-LBl
20130chr122957152295716rs4648630GA1.0145-2.0000000.9999923.287288e-051.0000005.095297e-045.095297e-04HMVEC-LBl
...................................................
8673chr123230692323070rs61762082CT1.0186-1.1277560.9998423.658550e-041.0000003.036597e-023.036597e-02h.weri-Rb-1
8688chr125867742586775rs371199788GA1.011181.0740010.0000699.999685e-010.0057379.999685e-015.736898e-03h.weri-Rb-1
5768chr1778596778597rs74512038CT1.01244-1.3719691.0000001.634014e-121.0000003.513130e-113.513130e-11iTH1
5769chr1827475827476rs142969342GA1.01324-2.1081821.0000002.486780e-311.0000001.069315e-291.069315e-29iTH1
5773chr113253521325353rs307346AC1.01331.6438560.0022759.993461e-010.0244759.996379e-012.447537e-02iTH1
\n", "

358 rows × 16 columns

\n", "
" ], "text/plain": [ " #chr start end ID ref alt mean_BAD # of SNPs \\\n", "20120 chr1 1659113 1659114 rs7534581 A G 1.5 1 \n", "20121 chr1 1659208 1659209 rs150671568 C A 1.0 1 \n", "20122 chr1 1659219 1659220 rs9661500 A G 1.0 1 \n", "20125 chr1 1919745 1919746 rs113425481 G T 1.5 1 \n", "20130 chr1 2295715 2295716 rs4648630 G A 1.0 1 \n", "... ... ... ... ... .. .. ... ... \n", "8673 chr1 2323069 2323070 rs61762082 C T 1.0 1 \n", "8688 chr1 2586774 2586775 rs371199788 G A 1.0 1 \n", "5768 chr1 778596 778597 rs74512038 C T 1.0 1 \n", "5769 chr1 827475 827476 rs142969342 G A 1.0 1 \n", "5773 chr1 1325352 1325353 rs307346 A C 1.0 1 \n", "\n", " max_coverage es_ag pval_ag_ref pval_ag_alt fdrp_bh_ref \\\n", "20120 297 -1.250651 1.000000 2.629178e-11 1.000000 \n", "20121 435 -1.045010 1.000000 1.864821e-13 1.000000 \n", "20122 332 -1.013056 1.000000 3.886425e-10 1.000000 \n", "20125 36 -2.047337 1.000000 6.320732e-04 1.000000 \n", "20130 45 -2.000000 0.999992 3.287288e-05 1.000000 \n", "... ... ... ... ... ... \n", "8673 86 -1.127756 0.999842 3.658550e-04 1.000000 \n", "8688 118 1.074001 0.000069 9.999685e-01 0.005737 \n", "5768 244 -1.371969 1.000000 1.634014e-12 1.000000 \n", "5769 324 -2.108182 1.000000 2.486780e-31 1.000000 \n", "5773 33 1.643856 0.002275 9.993461e-01 0.024475 \n", "\n", " fdrp_bh_alt min_fdrp_bh group_key \n", "20120 8.150450e-10 8.150450e-10 HMVEC-LBl \n", "20121 1.156189e-11 1.156189e-11 HMVEC-LBl \n", "20122 8.031945e-09 8.031945e-09 HMVEC-LBl \n", "20125 6.531423e-03 6.531423e-03 HMVEC-LBl \n", "20130 5.095297e-04 5.095297e-04 HMVEC-LBl \n", "... ... ... ... \n", "8673 3.036597e-02 3.036597e-02 h.weri-Rb-1 \n", "8688 9.999685e-01 5.736898e-03 h.weri-Rb-1 \n", "5768 3.513130e-11 3.513130e-11 iTH1 \n", "5769 1.069315e-29 1.069315e-29 iTH1 \n", "5773 9.996379e-01 2.447537e-02 iTH1 \n", "\n", "[358 rows x 16 columns]" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# get significantly imbalanced variants with higher than two-fold effect size\n", "aggregated_data[aggregated_data.eval('min_fdrp_bh < 0.05 & abs(es_ag) >= 1')]" ] }, { "cell_type": "code", "execution_count": null, "id": "1e1dc293", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }