Reviewer notes (text ahead of the "diff --git" line is skipped by patch tools):
- Line breaks and indentation were reconstructed from a whitespace-collapsed
  copy of this patch. Confirm the indentation of the context lines (third hunk
  especially) against the target file before applying.
- The "index" hashes are carried over from the original patch and do not
  describe the post-image of this revised version.
- Changes relative to the original patch: Barcode1 defaults are now actually
  defined (they were written as a second Barcode3 assignment), reads that lack
  a static sequence no longer vote for position -1, positions that would give
  negative slice bounds fall back to the defaults, and the commented-out
  legacy slices were dropped.

diff --git a/InDevOptimizations/DemultiplexUsingBarcodes_New_V1.py b/InDevOptimizations/DemultiplexUsingBarcodes_New_V1.py
index 95915f8..d6a2152 100644
--- a/InDevOptimizations/DemultiplexUsingBarcodes_New_V1.py
+++ b/InDevOptimizations/DemultiplexUsingBarcodes_New_V1.py
@@ -19,7 +19,7 @@
 #####
 # Set consistent parameters here
 Round1_barcode_staticSeq = "CATTCG"
-Round2_barcode_staticSeq = "AATCCA"
+Round2_barcode_staticSeq = "ATCCAC"
 Round3_barcode_staticSeq = "GTGGCC"
 #####
 
@@ -146,6 +146,77 @@ def return_fastq(self):
 Total_barcodes_detected = []
 Total_barcodes_passing_minReadThreshold = []
 
+######
+# Learn barcode positions from input fastqR
+######
+from collections import Counter
+
+print("Learning barcode positions...")
+
+# Default positions. Each pair stays in force unless the matching static
+# sequence is located in the learner fastq below.
+umi_start=0
+umi_end=10
+barcode3_start=10
+barcode3_end=18
+barcode2_start=48
+barcode2_end=int(48+8)
+barcode1_start=86
+barcode1_end=int(86+8)
+
+
+def most_common_hit(positions):
+    """Return the most frequent non-negative entry of positions, or -1 if there is none."""
+    # str.find() yields -1 when the static sequence is absent; misses must not vote.
+    hits = Counter(p for p in positions if p >= 0)
+    return hits.most_common(1)[0][0] if hits else -1
+
+
+# Record where each static sequence starts on the sequence line (2nd of 4) of every read
+line_ct_Learner = 0
+learner_bc1_list = []
+learner_bc2_list = []
+learner_bc3_list = []
+with open("position_learner_fastqr.fastq", "r") as infile:
+    for line in infile:
+        if (line_ct_Learner % 4 == 1):
+            learner_bc1_list.append(line.find(Round1_barcode_staticSeq))
+            learner_bc2_list.append(line.find(Round2_barcode_staticSeq))
+            learner_bc3_list.append(line.find(Round3_barcode_staticSeq))
+        line_ct_Learner += 1
+foundPosition_Round1_barcode=most_common_hit(learner_bc1_list)
+foundPosition_Round2_barcode=most_common_hit(learner_bc2_list)
+foundPosition_Round3_barcode=most_common_hit(learner_bc3_list)
+print("Extracted position1 = " + str(foundPosition_Round1_barcode))
+print("Extracted position2 = " + str(foundPosition_Round2_barcode))
+print("Extracted position3 = " + str(foundPosition_Round3_barcode))
+
+# Use extracted static sequence positions to infer barcode positions.
+# A learned position is adopted only when it leaves room for the bases in
+# front of it; otherwise the defaults above are kept.
+if foundPosition_Round3_barcode >= 18:
+    umi_start=int(foundPosition_Round3_barcode - 18)
+    umi_end=int(foundPosition_Round3_barcode - 8)
+    barcode3_start=int(foundPosition_Round3_barcode - 8)
+    barcode3_end=int(foundPosition_Round3_barcode)
+else:
+    print("Round3 static sequence not usable; keeping default UMI and Barcode3 positions")
+print("UMI position has been extracted as " + str(umi_start) + ":" + str(umi_end))
+print("Barcode3 position has been extracted as " + str(barcode3_start) + ":" + str(barcode3_end))
+if foundPosition_Round2_barcode >= 8:
+    barcode2_start=int(foundPosition_Round2_barcode - 8)
+    barcode2_end=int(foundPosition_Round2_barcode)
+else:
+    print("Round2 static sequence not usable; keeping default Barcode2 position")
+print("Barcode2 position has been extracted as " + str(barcode2_start) + ":" + str(barcode2_end))
+if foundPosition_Round1_barcode >= 0:
+    barcode1_start=int(foundPosition_Round1_barcode + 6)
+    barcode1_end=int(foundPosition_Round1_barcode + 14)
+else:
+    print("Round1 static sequence not usable; keeping default Barcode1 position")
+print("Barcode1 position has been extracted as " + str(barcode1_start) + ":" + str(barcode1_end))
+
+
 ######
 # Step2: Iterate through input fastqs in bins.
 
@@ -228,10 +299,10 @@ def return_fastq(self):
                 completeReadCounter += 1
             if (line_ct1 % 4 == 1):
                 lineRead=str(line[0:].rstrip())
-                lineReadUMI = lineRead[0:10]
-                lineReadBarcode3 = lineRead[10:18]
-                lineReadBarcode2 = lineRead[48:int(48+8)]
-                lineReadBarcode1 = lineRead[86:int(86+8)]
+                lineReadUMI = lineRead[umi_start:umi_end]
+                lineReadBarcode3 = lineRead[barcode3_start:barcode3_end]
+                lineReadBarcode2 = lineRead[barcode2_start:barcode2_end]
+                lineReadBarcode1 = lineRead[barcode1_start:barcode1_end]
                 filteredBarcode1 = [s for s in Eight_BP_barcode if hamming(s, lineReadBarcode1) <= int(args.errorThreshold)] # Match each extracted barcode to a greenlist of possible barcodes. If a match within hamming distance of 1 is found move forward with that match (not the extracted sequence).
                 filteredBarcode2 = [s for s in Eight_BP_barcode if hamming(s, lineReadBarcode2) <= int(args.errorThreshold)]
                 filteredBarcode3 = [s for s in Eight_BP_barcode if hamming(s, lineReadBarcode3) <= int(args.errorThreshold)]