rm(list=ls()) set.seed(666) library(stringdist) library(Biostrings) library(stringr) dat <- read.csv("~/Downloads/pnas.1802640115.sd01.csv",skip = 8, header = T, stringsAsFactors = F) #Select coligo sequences that are each differentiable by at least two. #I am choosing multiples of 96, because I want to concatenate the sequences #to make them much longer. The forward primer is 23sh at most bases long. #The forward MID is at most 10 bases long. Therefore, if I want a 250 base long seq, #I need to concate 12sh of the following sequences #making extra, because some will fail babyColigos <- sample(dat$Barcodes17.2[dat$Barcodes17.2 != ""], size = 96*20, replace = F) adultColigos <- vector() for(i in 1:(96*20/12)){ #pick 12 seqs for the coligo picks <- sample(babyColigos, size = 12, replace = F) #remove them from our options to choose from during the next iteration babyColigos <- babyColigos[!(babyColigos %in% picks)] print(length(babyColigos)) #paste them together into a coligo adultColigos[i] <- paste(picks, sep = "",collapse = '') } #Calculate edit distances between all coligos. #Minimum distance is very high using this method. #Each 17 base mer should be at least an edit distance of two #apart. distances <- vector() for(i in 1:length(adultColigos)){ distances <- c(distances, stringdist(adultColigos[i], adultColigos[-i], method = "lv")) } min(distances) #Check to see if primers and ISD are in the coligos. hits <- vector() for(n in 1:110){ query <- substr("TGCCACAGATACGTACCGCTCATAACGCGAACCGAAGCGCAGTAGAAGTACTCCGTATCCTACCTCGGTCGTGGTTTAGGCTATCGACATCTTGCATGGGCTTCCCTAGTGAACTCTTGGGATGT", start = n, stop = n + 15) hits <- c(hits, agrep(query, adultColigos)) } unique(hits) #Remove the hits. adultColigos <- adultColigos[-unique(hits)] #Check for hits to the primers #GTGCCAGCAGCCGCGGTAA #CTTGGTCATTTAGAGGAAGTAA #GGACTACAAGGGTATCTAAT #GCTGCGTTCTTCATCGATGC seqChecker <- function(x){ hits <- vector() for(n in 1:5){ query <- substr(x, start = n, stop = n + 15) hits <- c(hits, agrep(query, adultColigos, 2)) query <- substr(reverse(x), start = n, stop = n + 15) hits <- c(hits, agrep(query, adultColigos, 2)) query <- substr(as.character(Biostrings::complement(DNAString(x))), start = n, stop = n + 15) hits <- c(hits, agrep(query, adultColigos, 2)) query <- substr(as.character(Biostrings::reverseComplement(DNAString(x))), start = n, stop = n + 15) hits <- c(hits, agrep(query, adultColigos, 2)) } return(unique(hits)) } hits <- seqChecker("GTGCCAGCAGCCGCGGTAA") hits <- c(hits, seqChecker("CTTGGTCATTTAGAGGAAGTAA")) hits <- c(hits, seqChecker("GGACTACAAGGGTATCTAAT")) hits <- c(hits, seqChecker("GCTGCGTTCTTCATCGATGC")) #remove hits adultColigos <- adultColigos[-unique(hits)] #Calculate GC content and remove any baddies gc <- vector() for(i in 1:length(adultColigos)){ gc <- c(gc, str_count(adultColigos[i], "G")) } min(gc) max(gc) hist(gc) adultColigos <- adultColigos[gc > 40 & gc < 60] #Testing for N1, N2, RnaseP. WILL NEED CHANGED FOR THESE. hits <- seqChecker("GCGTTCTCCATTCTGGTTACT") hits <- c(hits, seqChecker("TGACTTCCATGCCAATGCG")) hits <- c(hits, seqChecker("TGAGCGGCTGTCTCCAC")) hits <- c(hits, seqChecker("CCCCAAAATCAGCGAAATG")) hits <- c(hits, seqChecker("GGAACTGATTACAAACATTGGC")) hits <- c(hits, seqChecker("TTTGGACCTGCGAGCG")) adultColigos <- adultColigos[-unique(hits)] adultColigos <- adultColigos[1:96] #Turn the coligos into a fasta fasta <- vector() k <- 1 for(i in 1:length(adultColigos)){ fasta[k] <- paste(">Coligo", i, sep = "") fasta[k+1] <- adultColigos[i] k <- k + 2 } write.table(fasta, file = "coligoSequences_longerVersion.fasta", row.names = F) #write.csv(adultColigos, file = "coligoSequences_longerVersion.csv")