Skip to content
This repository has been archived by the owner on Feb 28, 2023. It is now read-only.

Commit

Permalink
Adding PCHIC function; closes #3
Browse files Browse the repository at this point in the history
  • Loading branch information
dzerbino committed Nov 4, 2016
1 parent 1ddadbe commit 75876f7
Show file tree
Hide file tree
Showing 3 changed files with 104 additions and 5 deletions.
10 changes: 9 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ DEST_DIR=~/lustre2/CTTV24/databases_test


default: download process
download: create_dir d_GRASP d_Phewas_Catalog d_GWAS_DB d_Fantom5 d_DHS d_Regulome d_1000Genomes
download: create_dir d_GRASP d_Phewas_Catalog d_GWAS_DB d_Fantom5 d_DHS d_Regulome d_pchic d_1000Genomes
process: GRASP Phewas_Catalog GWAS_DB Fantom5 DHS Regulome tabix 1000Genomes

clean_raw:
Expand Down Expand Up @@ -58,6 +58,14 @@ d_Regulome:
Regulome:
gzip -dc ${DEST_DIR}/raw/regulome[123].csv.gz | sed -e 's/^chr//' | awk 'BEGIN {FS="\t"; OFS="\t"} { print $$1,$$2,$$2 + 1,$$5 }' | sort -k1,1 -k2,2n > ${DEST_DIR}/Regulome.bed

d_pchic:
mkdir -p ${DEST_DIR}/raw/pchic
wget -q -O - ftp://ftp.ensembl.org/pub/grch37/update/gtf/homo_sapiens/Homo_sapiens.GRCh37.82.gtf.gz | gzip -dc | awk 'BEGIN {OFS="\t"} $$3 == "gene" && $$7== "+" { print $$1, $$4, $$4+1, $$10 } $$3 == "gene" && $$7== "-" { print $$1, $$5, $$5+1, $$10 } ' | tr -d '";' > ${DEST_DIR}/raw/Ensembl_TSSs.bed
wget -q -O - ftp://ftp.ebi.ac.uk/pub/contrib/pchic/CHiCAGO/ | tr '<' '\n' | grep '.gz"' | sed -e 's/A HREF="//' -e 's/".*//' | sort | uniq | xargs -n1 -I file wget ftp://ftp.ebi.ac.uk/pub/contrib/pchic/CHiCAGO/file -P ${DEST_DIR}/raw/pchic

pchic:
gzip -dc ${DEST_DIR}/raw/pchic/* | sed -e 's/\<chr//g' | tr ':\-,' '\t' | bedtools intersect -wa -wb -a stdin -b ${DEST_DIR}/raw/Ensembl_TSSs.bed | cut -f4,5,6,13,7 | sort -k1,1 -k2,2n > ${DEST_DIR}/pchic.bed

tabix: bgz
$(eval bgz_files := $(wildcard ${DEST_DIR}/*.bed.gz))
$(foreach file, $(bgz_files), tabix -p bed $(file);)
Expand Down
8 changes: 8 additions & 0 deletions install_dependencies.sh
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ make
cp src/cpp/vcftools ../bin
cd ..

# bedtools
git clone https://github.com/arq5x/bedtools2.git
git checkout v2.26.0
cd bedtools2
make
cp bin/bedtools ../bin
cd ..

# pybedtools v0.7.8
pip install pybedtools

Expand Down
91 changes: 87 additions & 4 deletions scripts/gwas_to_genes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1531,6 +1531,64 @@ def get_dhs_evidence(feature, fdr_model, snp_hash):
tissue = None
)

def PCHIC(ld_snps, tissues):
"""
Returns all genes associated to a set of SNPs in PCHIC
Args:
* [ SNP ]
* [ string ] (tissues)
Returntype: [ Regulatory_Evidence ]
"""
intersection = overlap_snps_to_bed(ld_snps, DATABASES_DIR + "/pchic.bed")
snp_hash = dict( (snp.rsID, snp) for snp in ld_snps)
res = filter (lambda X: X is not None and X.score, (get_pchic_evidence(feature, snp_hash) for feature in intersection))

if DEBUG:
print "\tFound %i gene associations in PCHIC" % len(res)

return res

def get_pchic_evidence(feature, snp_hash):
"""
Parse output of bedtools searching for PCHIC evidence
Args:
* string
Returntype: Regulatory_Evidence
"""
'''
First 5 columns are from PCHIC file, the next 4 are LD SNP coords
Format of PCHIC correlation files:
1. chrom
2. chromStart
3. chromEnd
4. score
5. gene_id
6. chrom
7. start
8. end
9. rsID
'''
gene = get_gene(feature[4])
if gene is None:
return None
snp = snp_hash[feature[8]]

return Cisregulatory_Evidence(
snp = snp,
gene = gene,
score = 1,
source = "PCHIC",
study = None,
tissue = None
)

def STOPGAP_FDR(snp, gene, fdr_model):
"""
Expand Down Expand Up @@ -1576,7 +1634,10 @@ def get_gene(gene_name):
"""
if gene_name not in known_genes:
known_genes[gene_name] = fetch_gene(gene_name)
if gene_name[:4] != 'ENSG':
known_genes[gene_name] = fetch_gene(gene_name)
else:
known_genes[gene_name] = fetch_gene_id(gene_name)
return known_genes[gene_name]

def fetch_gene(gene_name):
Expand All @@ -1601,6 +1662,28 @@ def fetch_gene(gene_name):
except:
return None

def fetch_gene_id(gene_id):
"""
Get gene details from name
* string
Returntype: Gene
"""
server = "http://grch37.rest.ensembl.org"
ext = "/lookup/id/%s?content-type=application/json;expand=1" % (gene_id)
try:
hash = get_rest_json(server, ext)
return Gene(
name = hash['display_name'],
id = hash['id'],
chrom = hash['seq_region_name'],
tss = int(hash['start']) if hash['strand'] > 0 else int(hash['end']),
biotype = hash['biotype']
)
except:
return None


def get_ensembl_gene(ensembl_id):
"""
Expand Down Expand Up @@ -2021,7 +2104,7 @@ def pretty_output(associations):
Returntype: String
"""
header = "\t".join(['ld_snp_rsID', 'gene_symbol', 'gene_id', 'disease_names', 'disease_efo_ids', 'score', 'gwas_snp_ids', 'GWAS Catalog', 'GRASP', 'GWAS DB', 'Phewas Catalog', 'VEP', 'Regulome', 'GTEx', 'DHS', 'Fantom5'])
header = "\t".join(['ld_snp_rsID', 'gene_symbol', 'gene_id', 'disease_names', 'disease_efo_ids', 'score', 'gwas_snp_ids', 'GWAS Catalog', 'GRASP', 'GWAS DB', 'Phewas Catalog', 'VEP', 'Regulome', 'GTEx', 'DHS', 'Fantom5', 'PCHIC'])
content = map(pretty_cluster_association, associations)
return "\n".join([header] + content)

Expand Down Expand Up @@ -2061,14 +2144,14 @@ def pretty_cluster_association(association):
results.append(",".join(gwas_snp.snp.rsID for gwas_snp in gwas_snps))
for gwas_source in ['GWAS Catalog', 'GRASP', 'GWAS DB', 'Phewas Catalog']:
results.append(",".join(str(gwas_scores[gwas_source][gwas_snp.snp.rsID]) for gwas_snp in cluster.gwas_snps))
results += [str(functional_scores[ld_snp.rsID][functional_source]) for functional_source in ['VEP', 'Regulome', 'GTEx', 'DHS', 'Fantom5']]
results += [str(functional_scores[ld_snp.rsID][functional_source]) for functional_source in ['VEP', 'Regulome', 'GTEx', 'DHS', 'Fantom5', 'PCHIC']]
pretty_strings.append("\t".join(results))

return "\n".join(pretty_strings)

# List of databases used
database_functions = [GWASCatalog, GWAS_DB, Phewas_Catalog, GRASP]
ld_snp_to_gene_functions = [GTEx, Fantom5, VEP, DHS]
ld_snp_to_gene_functions = [GTEx, Fantom5, VEP, DHS, PCHIC]
snp_regulatory_functions = [Regulome] # TODO Implement and insert GERP code

if __name__ == "__main__":
Expand Down

0 comments on commit 75876f7

Please sign in to comment.