Skip to content

Commit

Permalink
submit scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
zgturan committed Aug 8, 2023
1 parent 88fefd4 commit edd313b
Show file tree
Hide file tree
Showing 72 changed files with 3,980 additions and 1 deletion.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1 +1,4 @@
# scWGA_comp
# README
This repository includes the code for "Single-cell somatic copy number variants in brain using different amplification methods and reference genomes" paper.

* Description of the scripts: [README](scripts/README.md)
13 changes: 13 additions & 0 deletions scWGA_comp.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
16 changes: 16 additions & 0 deletions scripts/00_Setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
mkdir results
mkdir scripts
mkdir data
cd data
mkdir raw
mkdir processed
cd processed
mkdir bam
mkdir bed
mkdir sam
mkdir trimmed
mkdir txt
mkdir rds
mkdir coverage
cd ../../

70 changes: 70 additions & 0 deletions scripts/01_Setup.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
suppressPackageStartupMessages(library(rrvgo))
suppressPackageStartupMessages(library(org.Hs.eg.db))
suppressPackageStartupMessages(library(mgsub))
suppressPackageStartupMessages(library(ggforce))
suppressPackageStartupMessages(library(ggpubr))
suppressPackageStartupMessages(library(ggridges))
suppressPackageStartupMessages(library(ggrepel))
suppressPackageStartupMessages(library(igraph))
suppressPackageStartupMessages(library(patchwork))
suppressPackageStartupMessages(library(readxl))
suppressPackageStartupMessages(library(unikn))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(grid))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(plyr))
suppressPackageStartupMessages(library(inline))
suppressPackageStartupMessages(library(kableExtra))
suppressPackageStartupMessages(library(GenomicRanges))
suppressPackageStartupMessages(library(biomaRt))
suppressPackageStartupMessages(library(RSQLite))
suppressPackageStartupMessages(library(wesanderson))
suppressPackageStartupMessages(library(ctc))
suppressPackageStartupMessages(library(gplots)) # Visual plotting of tables
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(nortest))
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(devtools))
suppressPackageStartupMessages(library(bedr))
suppressPackageStartupMessages(library(rtracklayer))
suppressPackageStartupMessages(library(broom))
suppressPackageStartupMessages(library(reshape2))
suppressPackageStartupMessages(library(gridBase))
suppressPackageStartupMessages(library(formattable))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(openxlsx))
suppressPackageStartupMessages(library(MASS))
suppressPackageStartupMessages(library(ggfortify))
suppressPackageStartupMessages(library(pals))
suppressPackageStartupMessages(library(viridis))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(tilingArray))
suppressPackageStartupMessages(library(effsize))
suppressPackageStartupMessages(library(nlme))
suppressPackageStartupMessages(library(broom.mixed))
suppressPackageStartupMessages(library(tidytext))
suppressPackageStartupMessages(library(qpcR))
suppressPackageStartupMessages(library(cluster)) # clustering algorithms
suppressPackageStartupMessages(library(UpSetR))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(ezfun))
suppressPackageStartupMessages(library(pheatmap))
suppressPackageStartupMessages(library(R.filesets))
suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(cowplot))
suppressPackageStartupMessages(library(colorspace))
suppressPackageStartupMessages(library(ggimage))
suppressPackageStartupMessages(library(ggstatsplot))
suppressPackageStartupMessages(library(PMCMRplus))
suppressPackageStartupMessages(library(mixtools))

theme_set(theme_pubr(base_size = 12, legend = 'top'))

pntnorm <- (1/0.352777778)

ampmethod <- setNames(c("#0078bf" ,'#f08122', '#5c2161'), c('PTA','dMDA','PicoPLEX'))
diagnoses <- setNames(c('#4b4b45', '#a61f56'), c('Control', 'MSA'))
brain_reg <- setNames(c('#edae49','#66a182'), c('Cingulate Cortex', 'Frontal Cortex'))

standardize_name <- function(name) {
return(gsub('[^0-9a-zA-Z]+', '_', name))}
7 changes: 7 additions & 0 deletions scripts/02_SampleInfo.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# by: Zeliha Gozde Turan
# date: Feb 25, 2023

source('./scripts/01_Setup.R')
sample_infA= data.frame(read_xlsx('./data/processed/txt/Christos_Brain_dataset_manifest.xlsx', sheet = 1))
write.table(sample_infA, file = "./data/processed/txt/SampleInfo.tsv", row.names=FALSE, sep="\t")
saveRDS(sample_infA, file = "./data/processed/rds/SampleInfo.rds")
73 changes: 73 additions & 0 deletions scripts/03_PrepareRData.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
args <- commandArgs(trailingOnly = T)
filenamex=as.character(args[[1]])

source('./scripts/01_Setup.R')

raw_dMDA_5Mb = read.table(file.path('./data/processed',filenamex, 'data'), header=TRUE, sep="\t")
saveRDS(raw_dMDA_5Mb, file = file.path('./data/processed/rds', paste0('raw_', filenamex, '.rds')))

CNV1= read.table(file.path('./data/processed', filenamex, 'CNV1'))
colnames(CNV1)= NULL
rownames(CNV1)= NULL
CNV1= data.frame(CNV1)
colnames(CNV1)=c('chr','start','end','id','cnv')
CNV1$chr= as.character(CNV1$chr)
CNV1$id= as.character(CNV1$id)
CNV1$data=rep(filenamex, nrow(CNV1))
CNV1$width=(CNV1$end - CNV1$start)+1
CNV1= CNV1[CNV1$chr %in% c(paste0("chr",1:22),'chrX','chrY'),]
CNV1= CNV1[!CNV1$start%in%0,]
saveRDS(CNV1, file = file.path('./data/processed/rds', paste0('CNV1_', filenamex, '.rds')))

SegNorm = read.table(file.path('./data/processed/', filenamex, 'SegNorm'), header=TRUE)
saveRDS(SegNorm, file = file.path('./data/processed/rds', paste0('SegNorm_', filenamex, '.rds')))
loca= SegNorm[,1:3]
saveRDS(loca, file= file.path('./data/processed/rds', paste0('location_', filenamex, '.rds')))

# locax for bedtools subtract to find differences
locax= loca[loca$CHR %in% paste0("chr",1:22),]
write.table(locax, file= file.path(paste0('./data/processed/diploid/',filenamex,'/output'), paste0('location_', filenamex, '.bed')),
quote = FALSE,sep = "\t",
row.names = FALSE,col.names = FALSE)

SegCopy = read.table(file.path('./data/processed', filenamex, 'SegCopy'), header=TRUE)
saveRDS(SegCopy, file = file.path('./data/processed/rds', paste0('SegCopy_', filenamex, '.rds')))

SegBreaks = read.table(file.path('./data/processed', filenamex, 'SegBreaks'), header=TRUE, sep="\t")
saveRDS(SegBreaks, file = file.path('./data/processed/rds', paste0('SegBreaks_', filenamex, '.rds')))

SegFix = read.table(file.path('./data/processed', filenamex, 'SegFixed'), header=TRUE, sep="\t")
saveRDS(SegFix, file = file.path('./data/processed/rds', paste0('SegFixed_', filenamex, '.rds')))

resultsx = read_tsv(file.path('./data/processed', filenamex, 'results.txt'))
resultsxx= as.data.frame(resultsx[,1:2])
resultsxx= resultsxx[!is.na(resultsxx[,'Copy_Number']),]
saveRDS(resultsxx, file = file.path('./data/processed/rds', paste0('cell_cn_', filenamex, '.rds')))

SegStats = read.table(file.path('./data/processed/', filenamex, 'SegStats'), header=TRUE, sep="\t")
segstat1= data.frame(rownames(SegStats), SegStats[,'Reads'], SegStats[,'Disp'], rep(filenamex,length(rownames(SegStats))))
saveRDS(segstat1, file = file.path('./data/processed/rds', paste0('SegStats_', filenamex, '.rds')))

###########
filenamesx= c("dMDA_101_5Mb.rds",
"dMDA_101_5Mb_lift.rds",
"dMDA_101_5Mb_t2t.rds",

"Pico_101_250kb_lift.rds",
"Pico_101_250kb_t2t.rds",
"Pico_101_250kb.rds",

"Pico_76_250kb_lift.rds",
"Pico_76_250kb_t2t.rds",
"Pico_76_250kb.rds",

"PTA_101_500kb_lift.rds",
"PTA_101_500kb_t2t.rds",
"PTA_101_500kb.rds",

"PTA_48_500kb_lift.rds",
"PTA_48_500kb_t2t.rds",
"PTA_48_500kb.rds")

saveRDS(filenamesx, file = './data/processed/rds/filenamesx.rds')
############
24 changes: 24 additions & 0 deletions scripts/04_ConfidenceScore.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
args <- commandArgs(trailingOnly = T)
filenamex=as.character(args[[1]])

turan_SegNorm = read.table(file.path('./data/processed/', filenamex, 'SegFixed'), header=TRUE, sep="\t")
turan_SegNorm$CHR= as.character(turan_SegNorm$CHR)
turan_ploidy <- readRDS(file.path('./data/processed/rds', paste0('cell_cn_', filenamex, '.rds')))
turan_ploidy= data.frame(id= turan_ploidy[,'Sample'], predicted_ploidy=as.numeric(turan_ploidy[,'Copy_Number']))
turan_ploidy$id= as.character(turan_ploidy$id)
identical(colnames(turan_SegNorm[,4:ncol(turan_SegNorm)]), turan_ploidy$id)
turan_clouds= sweep(turan_SegNorm[,4:ncol(turan_SegNorm)], 2, turan_ploidy$predicted_ploidy, "*")
saveRDS(turan_clouds, file= file.path('./data/processed/rds', paste0(filenamex, '_SegFixed.rds')))

loca= turan_SegNorm[turan_SegNorm$CHR%in%paste0('chr', 1:22),]
autosomes= nrow(loca)
turan_clouds= turan_clouds[1:autosomes,]
confx= c()
for (i in 1:ncol(turan_clouds)){
cellx= turan_clouds[,i]
CS = 1 - 2*(median(abs(cellx-round(cellx)), na.rm = T))
confx= c(confx, CS)}
confx2= data.frame(colnames(turan_clouds), confx, rep(filenamex, length(confx)))
colnames(confx2)= c('cellid', "confidence_score", "data")

saveRDS(confx2, file= file.path('./data/processed/rds', paste0(filenamex, '_ConfiScore.rds')))
22 changes: 22 additions & 0 deletions scripts/05_Preseq.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
args <- commandArgs(trailingOnly = T)
filenamex=as.character(args[[1]])

options(scipen=999)
outpath= "./data/processed/preseq"
source('./scripts/01_Setup.R')
if (file.exists(file.path(outpath, filenamex))) {
setwd(file.path(outpath, filenamex))
samples= list.files()
xx=c()
for (i in 1:length(samples)){
# print(i)
filex= data.frame(read.delim(samples[i]))
filex2= median(filex$EXPECTED_COVERED_BASES)
names(filex2)= samples[i]
xx=c(xx, filex2)
}
xxa= data.frame(xx)
xx2= data.frame(rownames(xxa), xxa[,1], rep(filenamex, length(samples)))
colnames(xx2)= c('id', "median_expected_covered_bases", "data")
saveRDS(xx2, file= file.path('./data/processed/rds', paste0(filenamex, '_preseq.rds')))
}
11 changes: 11 additions & 0 deletions scripts/06_SegFixed.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
args <- commandArgs(trailingOnly = T)
filenamex=as.character(args[[1]])

turan_SegNorm = read.table(file.path('./data/processed/', filenamex, 'SegFixed'), header=TRUE, sep="\t")
turan_SegNorm$CHR= as.character(turan_SegNorm$CHR)
turan_ploidy <- readRDS(file.path('./data/processed/rds', paste0('cell_cn_', filenamex, '.rds')))
turan_ploidy= data.frame(id= turan_ploidy[,'Sample'], predicted_ploidy=as.numeric(turan_ploidy[,'Copy_Number']))
turan_ploidy$id= as.character(turan_ploidy$id)
identical(colnames(turan_SegNorm[,4:ncol(turan_SegNorm)]), turan_ploidy$id)
turan_clouds= sweep(turan_SegNorm[,4:ncol(turan_SegNorm)], 2, turan_ploidy$predicted_ploidy, "*")
saveRDS(turan_clouds, file= file.path('./data/processed/rds', paste0(filenamex, '_SegFixed_clouds_genome.rds')))
60 changes: 60 additions & 0 deletions scripts/07_CNVStat_Filter_SegFixed.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
args <- commandArgs(trailingOnly = T)
filenamex=as.character(args[[1]])

exclude_files <- c("dMDA_101_5Mb_t2t", "Pico_101_250kb_t2t", "Pico_76_250kb_t2t", "PTA_101_500kb_t2t", "PTA_48_500kb_t2t",
'Pico_76_250kb_t2t_q1','Pico_76_250kb_t2t_q10','Pico_101_250kb_t2t_q1','Pico_101_250kb_t2t_q10',
'PTA_48_500kb_t2t_q1','PTA_48_500kb_t2t_q10','PTA_101_500kb_t2t_q1','PTA_101_500kb_t2t_q10')
if (!(filenamex %in% exclude_files)){
turan_CNV1_svd= readRDS(file.path('./data/processed/rds', paste0('CNV1_', filenamex,'.rds')))
CNV1= turan_CNV1_svd
number_of_CN= CNV1[(CNV1$chr %in% c(paste0("chr",1:22), "chrX")),]
number_of_CN5=number_of_CN
saveRDS(number_of_CN5, file= file.path('./data/processed/rds', paste0(filenamex, '_number_of_CN5_genome.rds')))
########################################################################
turan_Clouds <- readRDS(file.path('./data/processed/rds', paste0(filenamex, '_SegFixed_clouds_genome.rds')))
pos <- readRDS(file.path('./data/processed/rds', paste0('location_', filenamex, '.rds')))
pos$CHR= as.character(pos$CHR)
turan_Clouds2= cbind(pos, turan_Clouds)
turan_Clouds3= turan_Clouds2[(turan_Clouds2$CHR %in% c(paste0("chr",1:22), "chrX")),]
turan_Clouds3= turan_Clouds3[!turan_Clouds3$CHR%in%1,]
pos= pos[(pos$CHR %in% c(paste0("chr",1:22), "chrX")),]
pos= pos[!pos$START%in%0,]
###########################################################
number_of_CN6 = rbind()
for (x in 1:dim(number_of_CN5)[1]){
idx= number_of_CN5$id[x]
datax= unique(number_of_CN5[number_of_CN5$id%in%idx,'data'])
cnv_value = turan_Clouds3[(colnames(turan_Clouds3) %in% number_of_CN5$id[x])]
cnv_value2 = data.frame(pos, cnv_value)
chr = cnv_value2[cnv_value2$CHR %in% number_of_CN5$chr[x],]
chrx= unique(chr$CHR)
start_pos = chr[(chr$START %in% number_of_CN5$start[x]),]
start_posx = chr[(chr$START %in% number_of_CN5$start[x]),'START']
end_pos = chr[(chr$END %in% number_of_CN5$end[x]),]
end_posx = chr[(chr$END %in% number_of_CN5$end[x]),'END']
cnv_mean = mean(cnv_value[as.numeric(rownames(start_pos)[1]):as.numeric(rownames(end_pos)[1]),])
cnv_sd = sd(cnv_value[as.numeric(rownames(start_pos)[1]):as.numeric(rownames(end_pos)[1]),])
cnv_median = median(cnv_value[as.numeric(rownames(start_pos)[1]):as.numeric(rownames(end_pos)[1]),])
cnv_binsize = length(cnv_value[as.numeric(rownames(start_pos)[1]):as.numeric(rownames(end_pos)[1]),])
start_bin = as.numeric(rownames(start_pos)[1])
end_bin = as.numeric(rownames(end_pos)[1])
#effect size
cnv = number_of_CN5$cnv[x]
cnv_ss = cnv - cnv_mean
cnv_ss_t = cnv_ss / cnv_sd
aaa = c(chrx, start_posx, end_posx, idx, cnv, datax, cnv_mean, cnv_sd, cnv_median, cnv_binsize, cnv_ss_t, start_bin, end_bin)
number_of_CN6 = rbind(number_of_CN6, aaa)}
colnames(number_of_CN6)= c('chr','start','end', 'id', 'cnv','data','cn_mean','cn_sd','cn_median','cn_binsize','z2score','start_bin','end_bin')
rownames(number_of_CN6) = NULL
number_of_CN6=data.frame(number_of_CN6)
number_of_CN6$start=as.numeric(number_of_CN6$start)
number_of_CN6$end=as.numeric(number_of_CN6$end)
number_of_CN6$cnv=as.numeric(number_of_CN6$cnv)
number_of_CN6$cn_mean=as.numeric(number_of_CN6$cn_mean)
number_of_CN6$cn_sd= as.numeric(number_of_CN6$cn_sd)
number_of_CN6$cn_median= as.numeric(number_of_CN6$cn_median)
number_of_CN6$cn_binsize= as.numeric(number_of_CN6$cn_binsize)
number_of_CN6$z2score= as.numeric(number_of_CN6$z2score)
number_of_CN6$start_bin= as.numeric(number_of_CN6$start_bin)
number_of_CN6$end_bin= as.numeric(number_of_CN6$end_bin)
saveRDS(number_of_CN6, file= file.path('./data/processed/rds', paste0(filenamex, '_SegFixed_cnv_stat_genome.rds')))}
60 changes: 60 additions & 0 deletions scripts/08_CNVStat_Filter_SegFixed_T2T.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
args <- commandArgs(trailingOnly = T)
filenamex=as.character(args[[1]])

include_files <- c("dMDA_101_5Mb_t2t", "Pico_101_250kb_t2t", "Pico_76_250kb_t2t", "PTA_101_500kb_t2t", "PTA_48_500kb_t2t")
if ((filenamex %in% include_files)){
turan_CNV1_svd= readRDS(file.path('./data/processed/rds', paste0('CNV1_', filenamex,'.rds')))
CNV1= turan_CNV1_svd
number_of_CN= CNV1[(CNV1$chr %in% paste0("chr",1:22)),]
number_of_CN5=number_of_CN
saveRDS(number_of_CN5, file= file.path('./data/processed/rds', paste0(filenamex, '_number_of_CN5_genome.rds')))
########################################################################
turan_Clouds <- readRDS(file.path('./data/processed/rds', paste0(filenamex, '_SegFixed_clouds_genome.rds')))
pos <- readRDS(file.path('./data/processed/rds', paste0('location_', filenamex, '.rds')))
pos$CHR= as.character(pos$CHR)
turan_Clouds2= cbind(pos, turan_Clouds)
turan_Clouds3= turan_Clouds2[(turan_Clouds2$CHR %in% paste0("chr",1:22)),]
turan_Clouds3= turan_Clouds3[!turan_Clouds3$CHR%in%1,]
pos= pos[(pos$CHR %in% paste0("chr",1:22)),]
pos= pos[!pos$START%in%0,]
###########################################################
number_of_CN6 = rbind()
for (x in 1:dim(number_of_CN5)[1]){
idx= number_of_CN5$id[x]
datax= unique(number_of_CN5[number_of_CN5$id%in%idx,'data'])
cnv_value = turan_Clouds3[(colnames(turan_Clouds3) %in% number_of_CN5$id[x])]
cnv_value2 = data.frame(pos, cnv_value)
chr = cnv_value2[cnv_value2$CHR %in% number_of_CN5$chr[x],]
chrx= unique(chr$CHR)
start_pos = chr[(chr$START %in% number_of_CN5$start[x]),]
start_posx = chr[(chr$START %in% number_of_CN5$start[x]),'START']
end_pos = chr[(chr$END %in% number_of_CN5$end[x]),]
end_posx = chr[(chr$END %in% number_of_CN5$end[x]),'END']
cnv_mean = mean(cnv_value[as.numeric(rownames(start_pos)[1]):as.numeric(rownames(end_pos)[1]),])
cnv_sd = sd(cnv_value[as.numeric(rownames(start_pos)[1]):as.numeric(rownames(end_pos)[1]),])
cnv_median = median(cnv_value[as.numeric(rownames(start_pos)[1]):as.numeric(rownames(end_pos)[1]),])
cnv_binsize = length(cnv_value[as.numeric(rownames(start_pos)[1]):as.numeric(rownames(end_pos)[1]),])
start_bin = as.numeric(rownames(start_pos)[1])
end_bin = as.numeric(rownames(end_pos)[1])
#effect size
cnv = number_of_CN5$cnv[x]
cnv_ss = cnv - cnv_mean
cnv_ss_t = cnv_ss / cnv_sd
aaa = c(chrx, start_posx, end_posx, idx, cnv, datax, cnv_mean, cnv_sd, cnv_median, cnv_binsize, cnv_ss_t, start_bin, end_bin)
number_of_CN6 = rbind(number_of_CN6, aaa)}

colnames(number_of_CN6)= c('chr','start','end', 'id', 'cnv','data','cn_mean','cn_sd','cn_median','cn_binsize','z2score','start_bin','end_bin')
rownames(number_of_CN6) = NULL
number_of_CN6=data.frame(number_of_CN6)
number_of_CN6$start=as.numeric(number_of_CN6$start)
number_of_CN6$end=as.numeric(number_of_CN6$end)
number_of_CN6$cnv=as.numeric(number_of_CN6$cnv)
number_of_CN6$cn_mean=as.numeric(number_of_CN6$cn_mean)
number_of_CN6$cn_sd= as.numeric(number_of_CN6$cn_sd)
number_of_CN6$cn_median= as.numeric(number_of_CN6$cn_median)
number_of_CN6$cn_binsize= as.numeric(number_of_CN6$cn_binsize)
number_of_CN6$z2score= as.numeric(number_of_CN6$z2score)
number_of_CN6$start_bin= as.numeric(number_of_CN6$start_bin)
number_of_CN6$end_bin= as.numeric(number_of_CN6$end_bin)

saveRDS(number_of_CN6, file= file.path('./data/processed/rds', paste0(filenamex, '_SegFixed_cnv_stat_genome.rds')))}
Loading

0 comments on commit edd313b

Please sign in to comment.