rm(list=ls()) set.seed(666) library(stringdist) library(Biostrings) library(stringr) dat <- read.csv("~/Downloads/pnas.1802640115.sd01.csv",skip = 8, header = T, stringsAsFactors = F) #Select coligo sequences that are each differentiable by at least two. #I am choosing multiples of 96, because I want to concatenate the sequences #to make them much longer. The forward primer is 23sh at most bases long. #The forward MID is at most 10 bases long. Therefore, if I want a 250 base long seq, #I need to concate 12sh of the following sequences #making extra, because some will fail babyColigos <- sample(dat$Barcodes17.2[dat$Barcodes17.2 != ""], size = 96*40, replace = F) adultColigos <- vector() for(i in 1:(96*40/12)){ #pick 12 seqs for the coligo picks <- sample(babyColigos, size = 12, replace = F) #remove them from our options to choose from during the next iteration babyColigos <- babyColigos[!(babyColigos %in% picks)] #print(length(babyColigos)) #paste them together into a coligo adultColigos[i] <- paste(picks, sep = "",collapse = '') } #Calculate edit distances between all coligos. #Minimum distance is very high using this method. #Each 17 base mer should be at least an edit distance of two #apart. distances <- vector() for(i in 1:length(adultColigos)){ distances <- c(distances, stringdist(adultColigos[i], adultColigos[-i], method = "lv")) } min(distances) #Check to see if primers and ISD are in the coligos. hits <- vector() for(n in 1:110){ query <- substr("TGCCACAGATACGTACCGCTCATAACGCGAACCGAAGCGCAGTAGAAGTACTCCGTATCCTACCTCGGTCGTGGTTTAGGCTATCGACATCTTGCATGGGCTTCCCTAGTGAACTCTTGGGATGT", start = n, stop = n + 15) hits <- c(hits, agrep(query, adultColigos)) } unique(hits) #Remove the hits. adultColigos <- adultColigos[-unique(hits)] #Check for hits to the primers #GTGCCAGCAGCCGCGGTAA #CTTGGTCATTTAGAGGAAGTAA #GGACTACAAGGGTATCTAAT #GCTGCGTTCTTCATCGATGC seqChecker <- function(x, y = 5, mer = 15){ hits <- vector() for(n in 1:y){ query <- substr(x, start = n, stop = n + mer) hits <- c(hits, agrep(query, adultColigos, 2)) query <- substr(reverse(x), start = n, stop = n + mer) hits <- c(hits, agrep(query, adultColigos, 2)) query <- substr(as.character(Biostrings::complement(DNAString(x))), start = n, stop = n + mer) hits <- c(hits, agrep(query, adultColigos, 2)) query <- substr(as.character(Biostrings::reverseComplement(DNAString(x))), start = n, stop = n + mer) hits <- c(hits, agrep(query, adultColigos, 2)) } return(unique(hits)) } hits <- seqChecker("GTGCCAGCAGCCGCGGTAA") hits <- c(hits, seqChecker("CTTGGTCATTTAGAGGAAGTAA")) hits <- c(hits, seqChecker("GGACTACAAGGGTATCTAAT")) hits <- c(hits, seqChecker("GCTGCGTTCTTCATCGATGC")) #remove hits adultColigos <- adultColigos[-unique(hits)] #Calculate GC content and remove any baddies gc <- vector() for(i in 1:length(adultColigos)){ gc <- c(gc, str_count(adultColigos[i], "G")) } min(gc) max(gc) hist(gc) adultColigos <- adultColigos[gc > 40 & gc < 60] #Testing for N1, N2, RnaseP. hits <- seqChecker("GCGTTCTCCATTCTGGTTACT") hits <- c(hits, seqChecker("TGACTTCCATGCCAATGCG")) hits <- c(hits, seqChecker("TGAGCGGCTGTCTCCAC")) hits <- c(hits, seqChecker("CCCCAAAATCAGCGAAATG")) hits <- c(hits, seqChecker("GGAACTGATTACAAACATTGGC")) hits <- c(hits, seqChecker("TTTGGACCTGCGAGCG")) #Check for Illumina adaptors hits <- c(hits, agrep("AATGATACGGCGACCA", adultColigos, 3)) hits <- c(hits, agrep("GGCGACCACCGAGATCT", adultColigos, 3)) hits <- c(hits, agrep("ACACTCGTCGGCAGCGTC", adultColigos, 3)) hits <- c(hits, agrep("CAAGCAGAAGAC", adultColigos, 3)) hits <- c(hits, agrep("GACGGCATACGAGATG", adultColigos, 3)) hits <- c(hits, agrep("ATGTCTCGTGGGCTCGG", adultColigos, 3)) hits <- c(hits, seqChecker("AATGATACGGCGACCACCGAGATCTACACTCGTCGGCAGCGTC", y = 28)) hits <- c(hits, seqChecker("CAAGCAGAAGACGGCATACGAGATGTCTCGTGGGCTCGG", y = 25)) adultColigos <- adultColigos[-unique(hits)] adultColigos <- adultColigos[1:96] #Turn the coligos into a fasta fasta <- vector() k <- 1 for(i in 1:length(adultColigos)){ fasta[k] <- paste(">Coligo", i, sep = "") fasta[k+1] <- adultColigos[i] k <- k + 2 } write.table(fasta, file = "coligoSequences_longerVersion.fasta", row.names = F) out <- data.frame(rep("GTG[CT]CAGC[AC]GCCGCGGTAA", length(adultColigos)), #515 rep("CTTGGTCATTTAGAGGAAGTAA", length(adultColigos)), #its1f adultColigos, rep("GCTGCGTTCTTCATCGATGC", length(adultColigos)), #its2 rep("GGACTAC[ACT][ACG]GGGT[AT]TCTAAT", length(adultColigos)) #806 ) names(out) <- c("primer_515", "primer_ITS1f", "coligo", "primer_ITS2","primer_806") write.csv(out, file = "coligoSequences_longerVersion.csv", row.names = F)