From 75876f71143a715e256a47960fcf77a7f5263003 Mon Sep 17 00:00:00 2001 From: Daniel Zerbino Date: Tue, 1 Nov 2016 14:46:34 +0000 Subject: [PATCH] Adding PCHIC function; closes #3 --- Makefile | 10 ++++- install_dependencies.sh | 8 ++++ scripts/gwas_to_genes.py | 91 ++++++++++++++++++++++++++++++++++++++-- 3 files changed, 104 insertions(+), 5 deletions(-) diff --git a/Makefile b/Makefile index 2e20798..fb7e0b3 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ DEST_DIR=~/lustre2/CTTV24/databases_test default: download process -download: create_dir d_GRASP d_Phewas_Catalog d_GWAS_DB d_Fantom5 d_DHS d_Regulome d_1000Genomes +download: create_dir d_GRASP d_Phewas_Catalog d_GWAS_DB d_Fantom5 d_DHS d_Regulome d_pchic d_1000Genomes process: GRASP Phewas_Catalog GWAS_DB Fantom5 DHS Regulome tabix 1000Genomes clean_raw: @@ -58,6 +58,14 @@ d_Regulome: Regulome: gzip -dc ${DEST_DIR}/raw/regulome[123].csv.gz | sed -e 's/^chr//' | awk 'BEGIN {FS="\t"; OFS="\t"} { print $$1,$$2,$$2 + 1,$$5 }' | sort -k1,1 -k2,2n > ${DEST_DIR}/Regulome.bed +d_pchic: + mkdir -p ${DEST_DIR}/raw/pchic + wget -q -O - ftp://ftp.ensembl.org/pub/grch37/update/gtf/homo_sapiens/Homo_sapiens.GRCh37.82.gtf.gz | gzip -dc | awk 'BEGIN {OFS="\t"} $$3 == "gene" && $$7== "+" { print $$1, $$4, $$4+1, $$10 } $$3 == "gene" && $$7== "-" { print $$1, $$5, $$5+1, $$10 } ' | tr -d '";' > ${DEST_DIR}/raw/Ensembl_TSSs.bed + wget -q -O - ftp://ftp.ebi.ac.uk/pub/contrib/pchic/CHiCAGO/ | tr '<' '\n' | grep '.gz"' | sed -e 's/A HREF="//' -e 's/".*//' | sort | uniq | xargs -n1 -I file wget ftp://ftp.ebi.ac.uk/pub/contrib/pchic/CHiCAGO/file -P ${DEST_DIR}/raw/pchic + +pchic: + gzip -dc ${DEST_DIR}/raw/pchic/* | sed -e 's/\ ${DEST_DIR}/pchic.bed + tabix: bgz $(eval bgz_files := $(wildcard ${DEST_DIR}/*.bed.gz)) $(foreach file, $(bgz_files), tabix -p bed $(file);) diff --git a/install_dependencies.sh b/install_dependencies.sh index d1d827e..446eb05 100644 --- a/install_dependencies.sh +++ b/install_dependencies.sh @@ -36,6 +36,14 @@ make cp src/cpp/vcftools ../bin cd .. +# bedtools +git clone https://github.com/arq5x/bedtools2.git +git checkout v2.26.0 +cd bedtools2 +make +cp bin/bedtools ../bin +cd .. + # pybedtools v0.7.8 pip install pybedtools diff --git a/scripts/gwas_to_genes.py b/scripts/gwas_to_genes.py index c42501d..7d37ae3 100644 --- a/scripts/gwas_to_genes.py +++ b/scripts/gwas_to_genes.py @@ -1531,6 +1531,64 @@ def get_dhs_evidence(feature, fdr_model, snp_hash): tissue = None ) +def PCHIC(ld_snps, tissues): + """ + + Returns all genes associated to a set of SNPs in PCHIC + Args: + * [ SNP ] + * [ string ] (tissues) + Returntype: [ Regulatory_Evidence ] + + """ + intersection = overlap_snps_to_bed(ld_snps, DATABASES_DIR + "/pchic.bed") + snp_hash = dict( (snp.rsID, snp) for snp in ld_snps) + res = filter (lambda X: X is not None and X.score, (get_pchic_evidence(feature, snp_hash) for feature in intersection)) + + if DEBUG: + print "\tFound %i gene associations in PCHIC" % len(res) + + return res + +def get_pchic_evidence(feature, snp_hash): + """ + + Parse output of bedtools searching for PCHIC evidence + Args: + * string + Returntype: Regulatory_Evidence + + """ + ''' + + First 5 columns are from PCHIC file, the next 4 are LD SNP coords + Format of PCHIC correlation files: + 1. chrom + 2. chromStart + 3. chromEnd + 4. score + 5. gene_id + + 6. chrom + 7. start + 8. end + 9. rsID + + ''' + gene = get_gene(feature[4]) + if gene is None: + return None + snp = snp_hash[feature[8]] + + return Cisregulatory_Evidence( + snp = snp, + gene = gene, + score = 1, + source = "PCHIC", + study = None, + tissue = None + ) + def STOPGAP_FDR(snp, gene, fdr_model): """ @@ -1576,7 +1634,10 @@ def get_gene(gene_name): """ if gene_name not in known_genes: - known_genes[gene_name] = fetch_gene(gene_name) + if gene_name[:4] != 'ENSG': + known_genes[gene_name] = fetch_gene(gene_name) + else: + known_genes[gene_name] = fetch_gene_id(gene_name) return known_genes[gene_name] def fetch_gene(gene_name): @@ -1601,6 +1662,28 @@ def fetch_gene(gene_name): except: return None +def fetch_gene_id(gene_id): + """ + + Get gene details from name + * string + Returntype: Gene + + """ + server = "http://grch37.rest.ensembl.org" + ext = "/lookup/id/%s?content-type=application/json;expand=1" % (gene_id) + try: + hash = get_rest_json(server, ext) + return Gene( + name = hash['display_name'], + id = hash['id'], + chrom = hash['seq_region_name'], + tss = int(hash['start']) if hash['strand'] > 0 else int(hash['end']), + biotype = hash['biotype'] + ) + except: + return None + def get_ensembl_gene(ensembl_id): """ @@ -2021,7 +2104,7 @@ def pretty_output(associations): Returntype: String """ - header = "\t".join(['ld_snp_rsID', 'gene_symbol', 'gene_id', 'disease_names', 'disease_efo_ids', 'score', 'gwas_snp_ids', 'GWAS Catalog', 'GRASP', 'GWAS DB', 'Phewas Catalog', 'VEP', 'Regulome', 'GTEx', 'DHS', 'Fantom5']) + header = "\t".join(['ld_snp_rsID', 'gene_symbol', 'gene_id', 'disease_names', 'disease_efo_ids', 'score', 'gwas_snp_ids', 'GWAS Catalog', 'GRASP', 'GWAS DB', 'Phewas Catalog', 'VEP', 'Regulome', 'GTEx', 'DHS', 'Fantom5', 'PCHIC']) content = map(pretty_cluster_association, associations) return "\n".join([header] + content) @@ -2061,14 +2144,14 @@ def pretty_cluster_association(association): results.append(",".join(gwas_snp.snp.rsID for gwas_snp in gwas_snps)) for gwas_source in ['GWAS Catalog', 'GRASP', 'GWAS DB', 'Phewas Catalog']: results.append(",".join(str(gwas_scores[gwas_source][gwas_snp.snp.rsID]) for gwas_snp in cluster.gwas_snps)) - results += [str(functional_scores[ld_snp.rsID][functional_source]) for functional_source in ['VEP', 'Regulome', 'GTEx', 'DHS', 'Fantom5']] + results += [str(functional_scores[ld_snp.rsID][functional_source]) for functional_source in ['VEP', 'Regulome', 'GTEx', 'DHS', 'Fantom5', 'PCHIC']] pretty_strings.append("\t".join(results)) return "\n".join(pretty_strings) # List of databases used database_functions = [GWASCatalog, GWAS_DB, Phewas_Catalog, GRASP] -ld_snp_to_gene_functions = [GTEx, Fantom5, VEP, DHS] +ld_snp_to_gene_functions = [GTEx, Fantom5, VEP, DHS, PCHIC] snp_regulatory_functions = [Regulome] # TODO Implement and insert GERP code if __name__ == "__main__":